ref: e39de8d40483f4a00930d9565ae6996142fb4fed
parent: 7313ecdbd058561b0188785cdc8786cbd63207ce
author: Licai Guo <guolicai@gmail.com>
date: Tue Mar 18 15:41:32 EDT 2014
reoranize common to inc/src/x86/arm
--- a/codec/build/android/dec/jni/welsdecdemo.mk
+++ b/codec/build/android/dec/jni/welsdecdemo.mk
@@ -24,7 +24,7 @@
$(CONSOLE_DEC_PATH)/src/h264dec.cpp \
$(CONSOLE_DEC_PATH)/src/read_config.cpp \
$(CONSOLE_DEC_PATH)/src/d3d9_utils.cpp \
- $(CODEC_PATH)/common/logging.cpp \
+ $(CODEC_PATH)/common/src/logging.cpp \
myjni.cpp
#
# Header Includes
@@ -32,7 +32,7 @@
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../../../../api/svc \
$(LOCAL_PATH)/../../../../console/dec/inc \
- $(LOCAL_PATH)/../../../../common
+ $(LOCAL_PATH)/../../../../common/inc
#
# Compile Flags and Link Libraries
#
--- a/codec/build/android/enc/jni/welsencdemo.mk
+++ b/codec/build/android/enc/jni/welsencdemo.mk
@@ -23,7 +23,7 @@
LOCAL_SRC_FILES := \
$(CONSOLE_ENC_PATH)/src/welsenc.cpp \
$(CONSOLE_ENC_PATH)/src/read_config.cpp \
- $(CODEC_PATH)/common/logging.cpp \
+ $(CODEC_PATH)/common/src/logging.cpp \
myjni.cpp
#
@@ -34,7 +34,7 @@
$(LOCAL_PATH)/../../../../console/enc/inc \
$(LOCAL_PATH)/../../../../encoder/core/inc \
$(LOCAL_PATH)/../../../../processing/interface \
- $(LOCAL_PATH)/../../../../common
+ $(LOCAL_PATH)/../../../../common/inc
#
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -53,7 +53,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
StringPooling="true"
RuntimeLibrary="2"
@@ -133,7 +133,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
PreprocessorDefinitions="WIN64;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
@@ -211,7 +211,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
PreprocessorDefinitions="WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -290,7 +290,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
PreprocessorDefinitions="WIN64;_DEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -356,7 +356,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -365,7 +365,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -374,7 +374,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -383,13 +383,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\cpuid.asm"
+ RelativePath="..\..\..\common\x86\cpuid.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -396,7 +396,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -405,7 +405,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -414,7 +414,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -423,7 +423,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -436,7 +436,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -445,7 +445,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -454,7 +454,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -463,13 +463,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\deblock.asm"
+ RelativePath="..\..\..\common\x86\deblock.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -476,7 +476,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -485,7 +485,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -494,7 +494,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -503,13 +503,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\expand_picture.asm"
+ RelativePath="..\..\..\common\x86\expand_picture.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -516,7 +516,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -525,7 +525,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -534,7 +534,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -543,7 +543,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -556,7 +556,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -565,7 +565,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -574,7 +574,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -583,13 +583,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mb_copy.asm"
+ RelativePath="..\..\..\common\x86\mb_copy.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -596,7 +596,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -605,7 +605,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -614,7 +614,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -623,13 +623,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mc_chroma.asm"
+ RelativePath="..\..\..\common\x86\mc_chroma.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -636,7 +636,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -645,7 +645,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -654,7 +654,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -663,13 +663,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mc_luma.asm"
+ RelativePath="..\..\..\common\x86\mc_luma.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -676,7 +676,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -685,7 +685,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -694,7 +694,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -703,7 +703,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -726,11 +726,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\cpu.h"
+ RelativePath="..\..\..\common\inc\cpu.h"
>
</File>
<File
- RelativePath="..\..\..\common\cpu_core.h"
+ RelativePath="..\..\..\common\inc\cpu_core.h"
>
</File>
<File
@@ -738,7 +738,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\deblocking_common.h"
+ RelativePath="..\..\..\common\inc\deblocking_common.h"
>
</File>
<File
@@ -778,7 +778,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\expand_picture_common.h"
+ RelativePath="..\..\..\common\inc\expand_picture_common.h"
>
</File>
<File
@@ -790,11 +790,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\ls_defines.h"
+ RelativePath="..\..\..\common\inc\ls_defines.h"
>
</File>
<File
- RelativePath="..\..\..\common\macros.h"
+ RelativePath="..\..\..\common\inc\macros.h"
>
</File>
<File
@@ -810,11 +810,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\mc_common.h"
+ RelativePath="..\..\..\common\inc\mc_common.h"
>
</File>
<File
- RelativePath="..\..\..\common\measure_time.h"
+ RelativePath="..\..\..\common\inc\measure_time.h"
>
</File>
<File
@@ -862,7 +862,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\typedefs.h"
+ RelativePath="..\..\..\common\inc\typedefs.h"
>
</File>
<File
@@ -895,11 +895,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\cpu.cpp"
+ RelativePath="..\..\..\common\src\cpu.cpp"
>
</File>
<File
- RelativePath="..\..\..\common\crt_util_safe_x.cpp"
+ RelativePath="..\..\..\common\src\crt_util_safe_x.cpp"
>
</File>
<File
@@ -907,7 +907,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\deblocking_common.cpp"
+ RelativePath="..\..\..\common\src\deblocking_common.cpp"
>
</File>
<File
@@ -943,7 +943,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\logging.cpp"
+ RelativePath="..\..\..\common\src\logging.cpp"
>
</File>
<File
--- a/codec/build/win32/dec/WelsDecPlus.vcproj
+++ b/codec/build/win32/dec/WelsDecPlus.vcproj
@@ -53,7 +53,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
@@ -153,7 +153,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
@@ -252,7 +252,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -349,7 +349,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
--- a/codec/build/win32/dec/decConsole.vcproj
+++ b/codec/build/win32/dec/decConsole.vcproj
@@ -49,7 +49,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\encoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\encoder\core\inc"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
StringPooling="true"
RuntimeLibrary="2"
@@ -142,7 +142,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\encoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\encoder\core\inc"
PreprocessorDefinitions="WIN64;NDEBUG;_CONSOLE"
StringPooling="true"
RuntimeLibrary="2"
@@ -233,7 +233,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -327,7 +327,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
+ AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
PreprocessorDefinitions="WIN64;_DEBUG;_CONSOLE"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -443,7 +443,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\logging.cpp"
+ RelativePath="..\..\..\common\src\logging.cpp"
>
</File>
<File
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -52,7 +52,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;_DEBUG;_LIB;X86_ASM;MT_ENABLED"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -132,7 +132,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;_DEBUG;_LIB;X86_ASM;MT_ENABLED"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -214,7 +214,7 @@
InlineFunctionExpansion="2"
FavorSizeOrSpeed="1"
WholeProgramOptimization="true"
- AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;X86_ASM;MT_ENABLED;"
StringPooling="true"
RuntimeLibrary="2"
@@ -298,7 +298,7 @@
InlineFunctionExpansion="2"
FavorSizeOrSpeed="1"
WholeProgramOptimization="true"
- AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;NDEBUG;_LIB;MT_ENABLED;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
@@ -396,11 +396,11 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\cpu.cpp"
+ RelativePath="..\..\..\common\src\cpu.cpp"
>
</File>
<File
- RelativePath="..\..\..\common\crt_util_safe_x.cpp"
+ RelativePath="..\..\..\common\src\crt_util_safe_x.cpp"
>
</File>
<File
@@ -444,7 +444,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\deblocking_common.cpp"
+ RelativePath="..\..\..\common\src\deblocking_common.cpp"
>
</File>
<File
@@ -728,7 +728,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\logging.cpp"
+ RelativePath="..\..\..\common\src\logging.cpp"
>
</File>
<File
@@ -1424,7 +1424,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\WelsThreadLib.cpp"
+ RelativePath="..\..\..\common\src\WelsThreadLib.cpp"
>
</File>
</Filter>
@@ -1445,11 +1445,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\cpu.h"
+ RelativePath="..\..\..\common\inc\cpu.h"
>
</File>
<File
- RelativePath="..\..\..\common\cpu_core.h"
+ RelativePath="..\..\..\common\inc\cpu_core.h"
>
</File>
<File
@@ -1457,7 +1457,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\deblocking_common.h"
+ RelativePath="..\..\..\common\inc\deblocking_common.h"
>
</File>
<File
@@ -1485,7 +1485,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\expand_picture_common.h"
+ RelativePath="..\..\..\common\inc\expand_picture_common.h"
>
</File>
<File
@@ -1497,11 +1497,11 @@
>
</File>
<File
- RelativePath="..\..\..\common\ls_defines.h"
+ RelativePath="..\..\..\common\inc\ls_defines.h"
>
</File>
<File
- RelativePath="..\..\..\common\macros.h"
+ RelativePath="..\..\..\common\inc\macros.h"
>
</File>
<File
@@ -1513,7 +1513,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\mc_common.h"
+ RelativePath="..\..\..\common\inc\mc_common.h"
>
</File>
<File
@@ -1521,7 +1521,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\measure_time.h"
+ RelativePath="..\..\..\common\inc\measure_time.h"
>
</File>
<File
@@ -1637,7 +1637,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\typedefs.h"
+ RelativePath="..\..\..\common\inc\typedefs.h"
>
</File>
<File
@@ -1661,7 +1661,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\WelsThreadLib.h"
+ RelativePath="..\..\..\common\inc\WelsThreadLib.h"
>
</File>
</Filter>
@@ -1677,7 +1677,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1686,7 +1686,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1695,7 +1695,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1704,13 +1704,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\cpuid.asm"
+ RelativePath="..\..\..\common\x86\cpuid.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1726,7 +1726,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1735,7 +1735,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1744,7 +1744,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1757,7 +1757,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1766,7 +1766,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1775,7 +1775,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1784,13 +1784,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\deblock.asm"
+ RelativePath="..\..\..\common\x86\deblock.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1797,7 +1797,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1806,7 +1806,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1815,7 +1815,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1824,13 +1824,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\expand_picture.asm"
+ RelativePath="..\..\..\common\x86\expand_picture.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1837,7 +1837,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1846,7 +1846,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1855,7 +1855,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1864,7 +1864,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1877,7 +1877,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1886,7 +1886,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1895,7 +1895,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1904,13 +1904,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mb_copy.asm"
+ RelativePath="..\..\..\common\x86\mb_copy.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1917,7 +1917,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1926,7 +1926,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1935,7 +1935,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1944,13 +1944,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mc_chroma.asm"
+ RelativePath="..\..\..\common\x86\mc_chroma.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1957,7 +1957,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1966,7 +1966,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1975,7 +1975,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1984,13 +1984,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\mc_luma.asm"
+ RelativePath="..\..\..\common\x86\mc_luma.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1997,7 +1997,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2006,7 +2006,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2015,7 +2015,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2024,7 +2024,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2037,7 +2037,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2046,7 +2046,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2055,7 +2055,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2064,7 +2064,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2077,7 +2077,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2086,7 +2086,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2095,7 +2095,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2104,13 +2104,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\satd_sad.asm"
+ RelativePath="..\..\..\common\x86\satd_sad.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2117,7 +2117,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2126,7 +2126,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2135,7 +2135,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2144,7 +2144,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2157,7 +2157,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2166,7 +2166,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2175,7 +2175,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2184,13 +2184,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\vaa.asm"
+ RelativePath="..\..\..\common\x86\vaa.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2197,7 +2197,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2206,7 +2206,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2215,7 +2215,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2224,7 +2224,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- a/codec/build/win32/enc/WelsEncPlus.vcproj
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -52,7 +52,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -151,7 +151,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -254,7 +254,7 @@
FavorSizeOrSpeed="1"
EnableFiberSafeOptimizations="true"
WholeProgramOptimization="true"
- AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
@@ -359,7 +359,7 @@
FavorSizeOrSpeed="1"
EnableFiberSafeOptimizations="true"
WholeProgramOptimization="true"
- AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
--- a/codec/build/win32/enc/encConsole.vcproj
+++ b/codec/build/win32/enc/encConsole.vcproj
@@ -48,7 +48,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;MT_ENABLED;"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -143,7 +143,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;X86_ASM;MT_ENABLED;"
StringPooling="true"
RuntimeLibrary="2"
@@ -238,7 +238,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;_DEBUG;_CONSOLE;MT_ENABLED"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -334,7 +334,7 @@
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
- AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+ AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
PreprocessorDefinitions="WIN64;NDEBUG;_CONSOLE;MT_ENABLED"
StringPooling="true"
RuntimeLibrary="2"
@@ -407,7 +407,7 @@
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
>
<File
- RelativePath="..\..\..\common\logging.cpp"
+ RelativePath="..\..\..\common\src\logging.cpp"
>
</File>
<File
--- a/codec/common/WelsThreadLib.cpp
+++ /dev/null
@@ -1,473 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file WelsThreadLib.c
- *
- * \brief Interfaces introduced in thread programming
- *
- * \date 11/17/2009 Created
- *
- *************************************************************************************
- */
-
-
-#ifdef LINUX
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-#include <sched.h>
-#elif !defined(_WIN32)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/param.h>
-#include <unistd.h>
-#ifdef __APPLE__
-#define HW_NCPU_NAME "hw.logicalcpu"
-#else
-#define HW_NCPU_NAME "hw.ncpu"
-#endif
-#endif
-#ifdef ANDROID_NDK
-#include <cpu-features.h>
-#endif
-
-#include "WelsThreadLib.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-
-#ifdef _WIN32
-
-#ifdef WINAPI_FAMILY
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define InitializeCriticalSection(x) InitializeCriticalSectionEx(x, 0, 0)
-#endif
-#endif
-
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
- InitializeCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
- EnterCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
- LeaveCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
- DeleteCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-#else /* _WIN32 */
-
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
- return pthread_mutex_init (mutex, NULL);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
- return pthread_mutex_lock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
- return pthread_mutex_unlock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
- return pthread_mutex_destroy (mutex);
-}
-
-#endif /* !_WIN32 */
-
-
-#ifdef MT_ENABLED
-
-#ifdef _WIN32
-
-void WelsSleep (uint32_t dwMilliseconds) {
- Sleep (dwMilliseconds);
-}
-
-WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* event, const char* event_name) {
- WELS_EVENT h = CreateEvent (NULL, FALSE, FALSE, NULL);
-
- if (h == NULL) {
- return WELS_THREAD_ERROR_GENERAL;
- }
- *event = h;
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event) {
- if (SetEvent (*event)) {
- return WELS_THREAD_ERROR_OK;
- }
- return WELS_THREAD_ERROR_GENERAL;
-}
-
-WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event) {
- return WaitForSingleObject (*event, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
- return WaitForSingleObject (*event, dwMilliseconds);
-}
-
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
- WELS_EVENT* event_list, WELS_EVENT* master_event) {
- // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
- return WaitForMultipleObjects (nCount, event_list, FALSE, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
- WELS_EVENT* event_list, WELS_EVENT* master_event) {
- // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
- return WaitForMultipleObjects (nCount, event_list, TRUE, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name) {
- CloseHandle (*event);
-
- *event = NULL;
- return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
- void* arg, WELS_THREAD_ATTR attr) {
- WELS_THREAD_HANDLE h = CreateThread (NULL, 0, routine, arg, 0, NULL);
-
- if (h == NULL) {
- return WELS_THREAD_ERROR_GENERAL;
- }
- * thread = h;
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread) {
- WaitForSingleObject (thread, INFINITE);
- CloseHandle (thread);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_HANDLE WelsThreadSelf() {
- return GetCurrentThread();
-}
-
-WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
- SYSTEM_INFO si;
-
- GetSystemInfo (&si);
-
- pInfo->ProcessorCount = si.dwNumberOfProcessors;
-
- return WELS_THREAD_ERROR_OK;
-}
-
-#else
-
-void WelsSleep (uint32_t dwMilliseconds) {
- usleep (dwMilliseconds * 1000); // microseconds
-}
-
-WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
- void* arg, WELS_THREAD_ATTR attr) {
- WELS_THREAD_ERROR_CODE err = 0;
-
- pthread_attr_t at;
- err = pthread_attr_init (&at);
- if (err)
- return err;
-#ifndef __ANDROID__
- err = pthread_attr_setscope (&at, PTHREAD_SCOPE_SYSTEM);
- if (err)
- return err;
- err = pthread_attr_setschedpolicy (&at, SCHED_FIFO);
- if (err)
- return err;
-#endif
- err = pthread_create (thread, &at, routine, arg);
-
- pthread_attr_destroy (&at);
-
- return err;
-}
-
-WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread) {
- return pthread_join (thread, NULL);
-}
-
-WELS_THREAD_HANDLE WelsThreadSelf() {
- return pthread_self();
-}
-
-// unnamed semaphores aren't supported on OS X
-
-WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* p_event, const char* event_name) {
-#ifdef __APPLE__
- if (p_event == NULL || event_name == NULL)
- return WELS_THREAD_ERROR_GENERAL;
- *p_event = sem_open (event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0);
- if (*p_event == (sem_t*)SEM_FAILED) {
- sem_unlink (event_name);
- *p_event = NULL;
- return WELS_THREAD_ERROR_GENERAL;
- } else {
- return WELS_THREAD_ERROR_OK;
- }
-#else
- WELS_EVENT event = (WELS_EVENT) malloc(sizeof(*event));
- if (event == NULL)
- return WELS_THREAD_ERROR_GENERAL;
- WELS_THREAD_ERROR_CODE err = sem_init(event, 0, 0);
- if (!err) {
- *p_event = event;
- return err;
- }
- free(event);
- return err;
-#endif
-}
-WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name) {
-#ifdef __APPLE__
- WELS_THREAD_ERROR_CODE err = sem_close (*event); // match with sem_open
- if (event_name)
- sem_unlink (event_name);
- return err;
-#else
- WELS_THREAD_ERROR_CODE err = sem_destroy (*event); // match with sem_init
- free(*event);
- return err;
-#endif
-}
-
-WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event) {
- WELS_THREAD_ERROR_CODE err = 0;
-// int32_t val = 0;
-// sem_getvalue(event, &val);
-// fprintf( stderr, "before signal it, val= %d..\n",val );
- err = sem_post (*event);
-// sem_getvalue(event, &val);
-// fprintf( stderr, "after signal it, val= %d..\n",val );
- return err;
-}
-
-WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event) {
- return sem_wait (*event); // blocking until signaled
-}
-
-WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
- if (dwMilliseconds != (uint32_t) - 1) {
- return sem_wait (*event);
- } else {
-#if defined(__APPLE__)
- int32_t err = 0;
- int32_t wait_count = 0;
- do {
- err = sem_trywait (*event);
- if (WELS_THREAD_ERROR_OK == err)
- break;// WELS_THREAD_ERROR_OK;
- else if (wait_count > 0)
- break;
- usleep (dwMilliseconds * 1000);
- ++ wait_count;
- } while (1);
- return err;
-#else
- struct timespec ts;
- struct timeval tv;
-
- gettimeofday (&tv, 0);
-
- ts.tv_nsec = tv.tv_usec * 1000 + dwMilliseconds * 1000000;
- ts.tv_sec = tv.tv_sec + ts.tv_nsec / 1000000000;
- ts.tv_nsec %= 1000000000;
-
- return sem_timedwait (*event, &ts);
-#endif//__APPLE__
- }
-}
-
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
- WELS_EVENT* event_list, WELS_EVENT* master_event) {
- uint32_t nIdx = 0;
- uint32_t uiAccessTime = 2; // 2 us once
-
- if (nCount == 0)
- return WELS_THREAD_ERROR_WAIT_FAILED;
-
- if (master_event != NULL) {
- // This design relies on the events actually being semaphores;
- // if multiple events in the list have been signalled, the master
- // event should have a similar count (events in windows can't keep
- // track of the actual count, but the master event isn't needed there
- // since it uses WaitForMultipleObjects).
- int32_t err = sem_wait (*master_event);
- if (err != WELS_THREAD_ERROR_OK)
- return err;
- uiAccessTime = 0; // no blocking, just quickly loop through all to find the one that was signalled
- }
-
- while (1) {
- nIdx = 0; // access each event by order
- while (nIdx < nCount) {
- int32_t err = 0;
- int32_t wait_count = 0;
-
- /*
- * although such interface is not used in __GNUC__ like platform, to use
- * pthread_cond_timedwait() might be better choice if need
- */
- do {
- err = sem_trywait (event_list[nIdx]);
- if (WELS_THREAD_ERROR_OK == err)
- return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
- else if (wait_count > 0 || uiAccessTime == 0)
- break;
- usleep (uiAccessTime);
- ++ wait_count;
- } while (1);
- // we do need access next event next time
- ++ nIdx;
- }
- usleep (1); // switch to working threads
- if (master_event != NULL) {
- // A master event was used and was signalled, but none of the events in the
- // list was found to be signalled, thus wait a little more when rechecking
- // the list to avoid busylooping here.
- // If we ever hit this codepath it's mostly a bug in the code that signals
- // the events.
- uiAccessTime = 2;
- }
- }
-
- return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
- WELS_EVENT* event_list, WELS_EVENT* master_event) {
- uint32_t nIdx = 0;
- uint32_t uiCountSignals = 0;
- uint32_t uiSignalFlag = 0; // UGLY: suppose maximal event number up to 32
-
- if (nCount == 0 || nCount > (sizeof (uint32_t) << 3))
- return WELS_THREAD_ERROR_WAIT_FAILED;
-
- while (1) {
- nIdx = 0; // access each event by order
- while (nIdx < nCount) {
- const uint32_t kuiBitwiseFlag = (1 << nIdx);
-
- if ((uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag) { // non-blocking mode
- int32_t err = 0;
-// fprintf( stderr, "sem_wait(): start to wait event %d..\n", nIdx );
- if (master_event == NULL) {
- err = sem_wait (event_list[nIdx]);
- } else {
- err = sem_wait (*master_event);
- if (err == WELS_THREAD_ERROR_OK) {
- err = sem_wait (event_list[nIdx]);
- if (err != WELS_THREAD_ERROR_OK) {
- // We successfully waited for the master event,
- // but waiting for the individual event failed (e.g. EINTR?).
- // Increase the master event count so that the next retry will
- // work as intended.
- sem_post (*master_event);
- }
- }
- }
-// fprintf( stderr, "sem_wait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
- if (WELS_THREAD_ERROR_OK == err) {
-// int32_t val = 0;
-// sem_getvalue(&event_list[nIdx], &val);
-// fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
-
- uiSignalFlag |= kuiBitwiseFlag;
- ++ uiCountSignals;
- if (uiCountSignals >= nCount) {
- return WELS_THREAD_ERROR_OK;
- }
- }
- }
- // we do need access next event next time
- ++ nIdx;
- }
- }
-
- return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
-#ifdef ANDROID_NDK
- pInfo->ProcessorCount = android_getCpuCount();
- return WELS_THREAD_ERROR_OK;
-#elif defined(LINUX)
-
- cpu_set_t cpuset;
-
- CPU_ZERO (&cpuset);
-
- if (!sched_getaffinity (0, sizeof (cpuset), &cpuset))
- pInfo->ProcessorCount = CPU_COUNT (&cpuset);
- else
- pInfo->ProcessorCount = 1;
-
- return WELS_THREAD_ERROR_OK;
-
-#else
-
- size_t len = sizeof (pInfo->ProcessorCount);
-
- if (sysctlbyname (HW_NCPU_NAME, &pInfo->ProcessorCount, &len, NULL, 0) == -1)
- pInfo->ProcessorCount = 1;
-
- return WELS_THREAD_ERROR_OK;
-
-#endif//LINUX
-}
-
-#endif
-
-
-#endif // MT_ENABLED
-
--- a/codec/common/WelsThreadLib.h
+++ /dev/null
@@ -1,132 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file WelsThreadLib.h
- *
- * \brief Interfaces introduced in thread programming
- *
- * \date 11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef _WELS_THREAD_API_H_
-#define _WELS_THREAD_API_H_
-
-#include "typedefs.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-typedef HANDLE WELS_THREAD_HANDLE;
-typedef LPTHREAD_START_ROUTINE LPWELS_THREAD_ROUTINE;
-
-typedef CRITICAL_SECTION WELS_MUTEX;
-typedef HANDLE WELS_EVENT;
-
-#define WELS_THREAD_ROUTINE_TYPE DWORD WINAPI
-#define WELS_THREAD_ROUTINE_RETURN(rc) return (DWORD)rc;
-
-#else // NON-WINDOWS
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <pthread.h>
-#include <semaphore.h>
-#include <signal.h>
-#include <errno.h>
-#include <time.h>
-#include <sys/time.h>
-
-#include <sys/stat.h>
-#include <fcntl.h>
-
-typedef pthread_t WELS_THREAD_HANDLE;
-typedef void* (*LPWELS_THREAD_ROUTINE) (void*);
-
-typedef pthread_mutex_t WELS_MUTEX;
-typedef sem_t* WELS_EVENT;
-
-#define WELS_THREAD_ROUTINE_TYPE void *
-#define WELS_THREAD_ROUTINE_RETURN(rc) return (void*)(intptr_t)rc;
-
-#endif//_WIN32
-
-typedef int32_t WELS_THREAD_ERROR_CODE;
-typedef int32_t WELS_THREAD_ATTR;
-
-typedef struct _WelsLogicalProcessorInfo {
- int32_t ProcessorCount;
-} WelsLogicalProcessInfo;
-
-#define WELS_THREAD_ERROR_OK 0
-#define WELS_THREAD_ERROR_GENERAL ((uint32_t)(-1))
-#define WELS_THREAD_ERROR_WAIT_OBJECT_0 0
-#define WELS_THREAD_ERROR_WAIT_TIMEOUT ((uint32_t)0x00000102L)
-#define WELS_THREAD_ERROR_WAIT_FAILED WELS_THREAD_ERROR_GENERAL
-
-void WelsSleep (uint32_t dwMilliseconds);
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex);
-
-WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* p_event, const char* event_name);
-WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name);
-WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event);
-WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event);
-WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds);
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT* event_list,
- WELS_EVENT* master_event = NULL);
-WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list,
- WELS_EVENT* master_event = NULL);
-
-WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
- void* arg, WELS_THREAD_ATTR attr);
-
-WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread);
-
-WELS_THREAD_HANDLE WelsThreadSelf();
-
-WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- /dev/null
+++ b/codec/common/arm/arm_arch_common_macro.S
@@ -1,0 +1,64 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef __APPLE__
+
+.macro WELS_ASM_FUNC_BEGIN
+.align 2
+.arm
+.globl _$0
+_$0:
+.endm
+
+.macro WELS_ASM_FUNC_END
+mov pc, lr
+.endm
+#else
+
+.syntax unified
+.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+.text
+
+.macro WELS_ASM_FUNC_BEGIN funcName
+.align 2
+.arm
+.global \funcName
+.type \funcName, %function
+.func \funcName
+\funcName:
+.endm
+
+.macro WELS_ASM_FUNC_END
+mov pc, lr
+.endfunc
+.endm
+#endif
--- /dev/null
+++ b/codec/common/arm/deblocking_neon.S
@@ -1,0 +1,1052 @@
+/*!
+* \copy
+* Copyright (c) 2013, Cisco Systems
+* All rights reserved.
+
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef HAVE_NEON
+.text
+
+#include "arm_arch_common_macro.S"
+
+#ifdef __APPLE__
+.macro JMP_IF_128BITS_IS_ZERO
+ vorr.s16 $2, $0, $1
+ vmov r3, r2, $2
+ orr r3, r3, r2
+ cmp r3, #0
+.endm
+
+.macro MASK_MATRIX
+ vabd.u8 $6, $1, $2
+ vcgt.u8 $6, $4, $6
+
+ vabd.u8 $4, $0, $1
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
+
+ vabd.u8 $4, $3, $2
+ vclt.u8 $4, $4, $5
+ vand.u8 $6, $6, $4
+.endm
+
+
+.macro DIFF_LUMA_LT4_P1_Q1
+ vabd.u8 $9, $0, $2
+ vclt.u8 $9, $9, $4
+ vrhadd.u8 $8, $2, $3
+ vhadd.u8 $8, $0, $8
+ vsub.s8 $8, $8, $1
+ vmax.s8 $8, $8, $5
+ vmin.s8 $8, $8, $6
+ vand.s8 $8, $8, $9
+ vand.s8 $8, $8, $7
+ vadd.u8 $8, $1, $8
+ vabs.s8 $9, $9
+.endm
+
+.macro DIFF_LUMA_LT4_P0_Q0
+ vsubl.u8 $5, $0, $3
+ vsubl.u8 $6, $2, $1
+ vshl.s16 $6, $6, #2
+ vadd.s16 $5, $5, $6
+ vrshrn.s16 $4, $5, #3
+.endm
+
+.macro DIFF_LUMA_EQ4_P2P1P0
+ vaddl.u8 q4, $1, $2
+ vaddl.u8 q5, $3, $4
+ vadd.u16 q5, q4, q5
+
+ vaddl.u8 q4, $0, $1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
+
+ vrshrn.u16 $0, q5, #2
+ vrshrn.u16 $7, q4, #3
+
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, $5, $1
+ vadd.u16 q5, q4,q5
+
+ vaddl.u8 q4, $2, $5
+ vaddw.u8 q4, q4, $2
+ vaddw.u8 q4, q4, $3
+
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 $6, d10, d8
+.endm
+
+.macro DIFF_LUMA_EQ4_MASK
+ vmov $3, $2
+ vbsl.u8 $3, $0, $1
+.endm
+
+.macro DIFF_CHROMA_EQ4_P0Q0
+ vaddl.u8 $4, $0, $3
+ vaddw.u8 $5, $4, $1
+ vaddw.u8 $6, $4, $2
+ vaddw.u8 $5, $5, $0
+
+ vaddw.u8 $6, $6, $3
+ vrshrn.u16 $7, $5, #2
+ vrshrn.u16 $8, $6, #2
+.endm
+
+.macro LOAD_CHROMA_DATA_4
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.endm
+
+.macro STORE_CHROMA_DATA_4
+ vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+ vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.endm
+
+.macro LOAD_LUMA_DATA_3
+ vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
+ vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+.endm
+
+.macro STORE_LUMA_DATA_4
+ vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+ vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+.endm
+
+.macro LOAD_LUMA_DATA_4
+ vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1
+ vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1
+.endm
+
+.macro STORE_LUMA_DATA_3
+ vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
+ vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
+.endm
+
+.macro EXTRACT_DELTA_INTO_TWO_PART
+ vcge.s8 $1, $0, #0
+ vand $1, $0, $1
+ vsub.s8 $0, $1, $0
+.endm
+#else
+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+ vorr.s16 \arg2, \arg0, \arg1
+ vmov r3, r2, \arg2
+ orr r3, r3, r2
+ cmp r3, #0
+.endm
+
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vabd.u8 \arg6, \arg1, \arg2
+ vcgt.u8 \arg6, \arg4, \arg6
+
+ vabd.u8 \arg4, \arg0, \arg1
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
+
+ vabd.u8 \arg4, \arg3, \arg2
+ vclt.u8 \arg4, \arg4, \arg5
+ vand.u8 \arg6, \arg6, \arg4
+.endm
+
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ vabd.u8 \arg9, \arg0, \arg2
+ vclt.u8 \arg9, \arg9, \arg4
+ vrhadd.u8 \arg8, \arg2, \arg3
+ vhadd.u8 \arg8, \arg0, \arg8
+ vsub.s8 \arg8, \arg8, \arg1
+ vmax.s8 \arg8, \arg8, \arg5
+ vmin.s8 \arg8, \arg8, \arg6
+ vand.s8 \arg8, \arg8, \arg9
+ vand.s8 \arg8, \arg8, \arg7
+ vadd.u8 \arg8, \arg1, \arg8
+ vabs.s8 \arg9, \arg9
+.endm
+
+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vsubl.u8 \arg5, \arg0, \arg3
+ vsubl.u8 \arg6, \arg2, \arg1
+ vshl.s16 \arg6, \arg6, #2
+ vadd.s16 \arg5, \arg5, \arg6
+ vrshrn.s16 \arg4, \arg5, #3
+.endm
+
+
+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+ vaddl.u8 q4, \arg1, \arg2
+ vaddl.u8 q5, \arg3, \arg4
+ vadd.u16 q5, q4, q5
+
+ vaddl.u8 q4, \arg0, \arg1
+ vshl.u16 q4, q4, #1
+ vadd.u16 q4, q5, q4
+
+ vrshrn.u16 \arg0, q5, #2
+ vrshrn.u16 \arg7, q4, #3
+
+ vshl.u16 q5, q5, #1
+ vsubl.u8 q4, \arg5, \arg1
+ vadd.u16 q5, q4,q5
+
+ vaddl.u8 q4, \arg2, \arg5
+ vaddw.u8 q4, q4, \arg2
+ vaddw.u8 q4, q4, \arg3
+
+ vrshrn.u16 d10,q5, #3
+ vrshrn.u16 d8, q4, #2
+ vbsl.u8 \arg6, d10, d8
+.endm
+
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+ vmov \arg3, \arg2
+ vbsl.u8 \arg3, \arg0, \arg1
+.endm
+
+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vaddl.u8 \arg4, \arg0, \arg3
+ vaddw.u8 \arg5, \arg4, \arg1
+ vaddw.u8 \arg6, \arg4, \arg2
+ vaddw.u8 \arg5, \arg5, \arg0
+ vaddw.u8 \arg6, \arg6, \arg3
+ vrshrn.u16 \arg7, \arg5, #2
+ vrshrn.u16 \arg8, \arg6, #2
+.endm
+
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.endm
+
+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+ vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.endm
+
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+ vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.endm
+
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+ vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+ vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+.endm
+
+.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1
+ vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1
+.endm
+
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+ vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.endm
+
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+ vcge.s8 \arg1, \arg0, #0
+ vand \arg1, \arg0, \arg1
+ vsub.s8 \arg0, \arg1, \arg0
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
+ vpush {q4-q7}
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
+
+ add r2, r1, r1, lsl #1
+ sub r2, r0, r2
+ vld1.u8 {q0}, [r2], r1
+ vld1.u8 {q3}, [r0], r1
+ vld1.u8 {q1}, [r2], r1
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q2}, [r2]
+ vld1.u8 {q5}, [r0]
+ sub r2, r2, r1
+
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
+
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
+
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ vst1.u8 {q6}, [r2], r1
+
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
+ vst1.u8 {q2}, [r2], r1
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
+ vst1.u8 {q3}, [r2] , r1
+ vst1.u8 {q7}, [r2]
+
+ vpop {q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
+ vpush {q4-q7}
+
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
+
+ sub r3, r0, r1, lsl #2
+ vld1.u8 {q8}, [r3], r1
+ vld1.u8 {q12}, [r0], r1
+ vld1.u8 {q9}, [r3], r1
+ vld1.u8 {q13}, [r0], r1
+ vld1.u8 {q10}, [r3], r1
+ vld1.u8 {q14}, [r0], r1
+ vld1.u8 {q11}, [r3]
+ vld1.u8 {q15}, [r0]
+ sub r3, r3, r1 , lsl #1
+
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
+
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
+
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
+
+ vmov q3, q1
+
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
+ vst1.u8 {q4}, [r3], r1
+
+ vmov q0, q2
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
+
+ vand.u8 q0, q7, q0
+ DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
+ vst1.u8 {q4}, [r3], r1
+ DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
+ vst1.u8 {q4}, [r3], r1
+
+ vpop {q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
+ vpush {q4-q7}
+
+ vdup.u8 q11, r2
+ vdup.u8 q9, r3
+
+ sub r2, r0, #3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
+ LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
+
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
+ LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
+
+ vswp d1, d2
+ vswp d3, d4
+ vswp d1, d4
+ vswp d7, d8
+ vswp d9, d10
+ vswp d7, d10
+
+ sub r0, r0, r1, lsl #4
+
+ ldr r3, [sp, #64]
+ vld1.s8 {d31}, [r3]
+ vdup.s8 d28, d31[0]
+ vdup.s8 d30, d31[1]
+ vdup.s8 d29, d31[2]
+ vdup.s8 d31, d31[3]
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vcge.s8 q10, q14, #0
+
+ MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+ vand.u8 q10, q10, q15
+
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+ DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+
+ vabs.s8 q12, q12
+ vabs.s8 q13, q13
+ vadd.u8 q14,q14,q12
+ vadd.u8 q14,q14,q13
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+ vand.s8 q8, q8, q10
+ EXTRACT_DELTA_INTO_TWO_PART q8, q9
+ vqadd.u8 q2, q2, q9
+ vqsub.u8 q2, q2, q8
+
+ vqsub.u8 q3, q3, q9
+ vqadd.u8 q3, q3, q8
+
+ sub r0, #2
+ add r2, r0, r1
+ lsl r1, #1
+
+ vmov q1, q6
+ vmov q4, q7
+
+ vswp q2, q3
+ vswp d3, d6
+ vswp d5, d8
+
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
+ STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
+
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
+ STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
+
+ vpop {q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
+ vpush {q4-q7}
+ vdup.u8 q5, r2
+ vdup.u8 q4, r3
+
+ sub r3, r0, #4 // pix -= 4
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,0
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,1
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,2
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,3
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,4
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,5
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,6
+ LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,7
+
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,0
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,1
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,2
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,3
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,4
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,5
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,6
+ LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,7
+
+ vswp q9, q10
+ vswp d17,d18
+ vswp d21,d22
+ vswp q13,q14
+ vswp d25,d26
+ vswp d29,d30
+ sub r0, r0, r1 , lsl #4
+
+ MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
+
+ mov r2, r2, lsr #2
+ add r2, r2, #2
+ vdup.u8 q5, r2
+ vabd.u8 q0, q11, q12
+ vclt.u8 q7, q0, q5
+
+ vabd.u8 q1, q9, q11
+ vclt.u8 q1, q1, q4
+ vand.s8 q1, q1, q7
+
+ vabd.u8 q2, q14,q12
+ vclt.u8 q2, q2, q4
+ vand.s8 q2, q2, q7
+ vand.u8 q7, q7, q6
+
+ vmov q3, q1
+
+ DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
+ DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
+
+ vand.u8 q3, q7, q3
+ DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
+ vmov q9, q4
+ vbsl.u8 q3, q8, q10
+ DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
+
+ vand.u8 q7, q7, q2
+
+ DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
+ DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
+
+ vbsl.u8 q6, q2, q12
+ DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
+
+ vbsl.u8 q7, q0, q14
+
+ vmov q5, q6
+ vmov q2, q9
+ vmov q6, q4
+ vmov q4, q8
+
+ vswp d8, d6
+ vswp d5, d7
+ vswp d5, d8
+ vswp d14, d12
+ vswp d11, d13
+ vswp d11, d14
+
+ sub r3, r0, #3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
+ STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
+
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
+ STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
+
+ vpop {q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
+
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
+
+ vld1.u8 {d0}, [r0], r2
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0]
+ vld1.u8 {d7}, [r1]
+
+ sub r0, r0, r2, lsl #1
+ sub r1, r1, r2, lsl #1
+
+ vld1.s8 {d31}, [r3]
+ vmovl.u8 q14,d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vst1.u8 {d2}, [r0], r2
+ vst1.u8 {d3}, [r1], r2
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
+ vst1.u8 {d4}, [r0]
+ vst1.u8 {d5}, [r1]
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
+ vpush {q4-q5}
+
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
+
+ sub r0, r0, r2 , lsl #1
+ sub r1, r1, r2, lsl #1
+ vdup.u8 q9, r3
+ vld1.u8 {d0}, [r0], r2 // q0::p1
+ vld1.u8 {d1}, [r1], r2
+ vld1.u8 {d2}, [r0], r2 // q1::p0
+ vld1.u8 {d3}, [r1], r2
+ vld1.u8 {d4}, [r0], r2 // q2::q0
+ vld1.u8 {d5}, [r1], r2
+ vld1.u8 {d6}, [r0] // q3::q1
+ vld1.u8 {d7}, [r1]
+
+ sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
+ sub r1, r1, r2, lsl #1
+
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+
+ vmov q11, q10
+
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
+
+ vbsl.u8 q10, q15, q1
+ vst1.u8 {d20}, [r0], r2
+ vst1.u8 {d21}, [r1], r2
+
+ vbsl.u8 q11, q0, q2
+ vst1.u8 {d22}, [r0]
+ vst1.u8 {d23}, [r1]
+
+ vpop {q4-q5}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
+
+ vdup.u8 q11, r3
+ ldr r3, [sp, #0]
+
+ sub r0, r0, #2
+ vdup.u8 q9, r3
+ ldr r3, [sp, #4]
+ sub r1, r1, #2
+ vld1.s8 {d31}, [r3]
+
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
+
+ vmovl.u8 q14, d31
+ vshl.u64 d29,d28,#8
+ vorr d28,d29
+ vmov d29, d28
+ veor q15, q15
+ vsub.i8 q15,q15,q14
+
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+
+ DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+ DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+ vmax.s8 q8, q8, q15
+ vmin.s8 q8, q8, q14
+
+ vand.s8 q8, q8, q10
+ vcge.s8 q14, q14, #0
+ vand.s8 q8, q8, q14
+ EXTRACT_DELTA_INTO_TWO_PART q8, q10
+ vqadd.u8 q1, q1, q10
+ vqsub.u8 q1, q1, q8
+ vqsub.u8 q2, q2, q10
+ vqadd.u8 q2, q2, q8
+
+ sub r0, r0, r2, lsl #3
+ sub r1, r1, r2, lsl #3
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
+ vpush {q4-q5}
+ vdup.u8 q11, r3
+ ldr r3, [sp, #32]
+
+ sub r0, r0, #2
+ sub r1, r1, #2
+
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+ vswp q1, q2
+ vswp d1, d2
+ vswp d6, d5
+
+ vdup.u8 q9, r3
+ MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+ vmov q11, q10
+
+ DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
+ DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
+
+ vbsl.u8 q10, q4, q1
+ vbsl.u8 q11, q5, q2
+ sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
+ sub r1, r1, r2, lsl #3
+
+ vmov q1, q10
+ vmov q2, q11
+ vswp d1, d2
+ vswp d6, d5
+ vswp q1, q2
+ // Cb:d0d1d2d3, Cr:d4d5d6d7
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+ STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+ vpop {q4-q5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
+
+ vld1.64 {d0-d2}, [r0]
+
+ vceq.s8 q0, q0, #0
+ vceq.s8 d2, d2, #0
+ vmvn q0, q0
+ vmvn d2, d2
+ vabs.s8 q0, q0
+ vabs.s8 d2, d2
+
+ vst1.64 {d0-d2}, [r0]
+WELS_ASM_FUNC_END
+
+#ifdef __APPLE__
+.macro BS_NZC_CHECK
+ vld1.8 {d0,d1}, [$0]
+ /* Arrenge the input data --- TOP */
+ ands r6, $1, #2
+ beq bs_nzc_check_jump0
+
+ sub r6, $0, $2, lsl #4
+ sub r6, $2, lsl #3
+ add r6, #12
+ vld1.32 d3[1], [r6]
+
+bs_nzc_check_jump0:
+ vext.8 q1, q1, q0, #12
+ vadd.u8 $3, q0, q1
+
+
+ /* Arrenge the input data --- LEFT */
+ ands r6, $1, #1
+ beq bs_nzc_check_jump1
+
+ sub r6, $0, #21
+ add r7, r6, #4
+ vld1.8 d3[4], [r6]
+ add r6, r7, #4
+ vld1.8 d3[5], [r7]
+ add r7, r6, #4
+ vld1.8 d3[6], [r6]
+ vld1.8 d3[7], [r7]
+
+bs_nzc_check_jump1:
+ vzip.8 d0, d1
+ vzip.8 d0, d1
+ vext.8 q1, q1, q0, #12
+ vadd.u8 $4, q0, q1
+.endm
+
+.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
+ mov r6, #4
+ vabd.s16 q8, $0, $1
+ vabd.s16 q9, $1, $2
+ vdup.s16 $0, r6
+ vabd.s16 q10, $2, $3
+ vabd.s16 q11, $3, $4
+
+ vcge.s16 q8, $0
+ vcge.s16 q9, $0
+ vcge.s16 q10, $0
+ vcge.s16 q11, $0
+
+ vpadd.i16 d16, d16, d17
+ vpadd.i16 d17, d18, d19
+ vpadd.i16 d18, d20, d21
+ vpadd.i16 d19, d22, d23
+
+ vaddhn.i16 $5, q8, q8
+ vaddhn.i16 $6, q9, q9
+.endm
+
+.macro BS_MV_CHECK
+ vldm $0, {q0,q1,q2,q3}
+
+ /* Arrenge the input data --- TOP */
+ ands r6, $1, #2
+ beq bs_mv_check_jump0
+
+ sub r6, $0, $2, lsl #6
+ add r6, #48
+ vld1.8 {d8, d9}, [r6]
+
+bs_mv_check_jump0:
+ BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
+
+ /* Arrenge the input data --- LEFT */
+ ands r6, $1, #1
+ beq bs_mv_check_jump1
+
+ sub r6, $0, #52
+ add r7, r6, #16
+ vld1.32 d8[0], [r6]
+ add r6, r7, #16
+ vld1.32 d8[1], [r7]
+ add r7, r6, #16
+ vld1.32 d9[0], [r6]
+ vld1.32 d9[1], [r7]
+
+bs_mv_check_jump1:
+ vzip.32 q0, q2
+ vzip.32 q1, q3
+ vzip.32 q0, q1
+ vzip.32 q2, q3
+ BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
+.endm
+#else
+.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
+ vld1.8 {d0,d1}, [\arg0]
+ /* Arrenge the input data --- TOP */
+ ands r6, \arg1, #2
+ beq bs_nzc_check_jump0
+
+ sub r6, \arg0, \arg2, lsl #4
+ sub r6, r6, \arg2, lsl #3
+ add r6, #12
+ vld1.32 d3[1], [r6]
+
+bs_nzc_check_jump0:
+ vext.8 q1, q1, q0, #12
+ vadd.u8 \arg3, q0, q1
+
+
+ /* Arrenge the input data --- LEFT */
+ ands r6, \arg1, #1
+ beq bs_nzc_check_jump1
+
+ sub r6, \arg0, #21
+ add r7, r6, #4
+ vld1.8 d3[4], [r6]
+ add r6, r7, #4
+ vld1.8 d3[5], [r7]
+ add r7, r6, #4
+ vld1.8 d3[6], [r6]
+ vld1.8 d3[7], [r7]
+
+bs_nzc_check_jump1:
+ vzip.8 d0, d1
+ vzip.8 d0, d1
+ vext.8 q1, q1, q0, #12
+ vadd.u8 \arg4, q0, q1
+.endm
+
+.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
+ mov r6, #4
+ vabd.s16 q8, \arg0, \arg1
+ vabd.s16 q9, \arg1, \arg2
+ vdup.s16 \arg0, r6
+ vabd.s16 q10, \arg2, \arg3
+ vabd.s16 q11, \arg3, \arg4
+
+ vcge.s16 q8, \arg0
+ vcge.s16 q9, \arg0
+ vcge.s16 q10, \arg0
+ vcge.s16 q11, \arg0
+
+ vpadd.i16 d16, d16, d17
+ vpadd.i16 d17, d18, d19
+ vpadd.i16 d18, d20, d21
+ vpadd.i16 d19, d22, d23
+
+ vaddhn.i16 \arg5, q8, q8
+ vaddhn.i16 \arg6, q9, q9
+.endm
+
+.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ vldm \arg0, {q0,q1,q2,q3}
+
+ /* Arrenge the input data --- TOP */
+ ands r6, \arg1, #2
+ beq bs_mv_check_jump0
+
+ sub r6, \arg0, \arg2, lsl #6
+ add r6, #48
+ vld1.8 {d8, d9}, [r6]
+
+bs_mv_check_jump0:
+ BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
+
+ /* Arrenge the input data --- LEFT */
+ ands r6, \arg1, #1
+ beq bs_mv_check_jump1
+
+ sub r6, \arg0, #52
+ add r7, r6, #16
+ vld1.32 d8[0], [r6]
+ add r6, r7, #16
+ vld1.32 d8[1], [r7]
+ add r7, r6, #16
+ vld1.32 d9[0], [r6]
+ vld1.32 d9[1], [r7]
+
+bs_mv_check_jump1:
+ vzip.32 q0, q2
+ vzip.32 q1, q3
+ vzip.32 q0, q1
+ vzip.32 q2, q3
+ BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
+.endm
+#endif
+
+
+WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
+
+ stmdb sp!, {r5-r7}
+ vpush {q4}
+
+ ldr r5, [sp, #28] //Save BS to r5
+
+ /* Checking the nzc status */
+ BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+
+ /* For checking bS[I] = 2 */
+ mov r6, #2
+ vcgt.s8 q14, q14, #0
+ vdup.u8 q0, r6
+ vcgt.s8 q15, q15, #0
+
+ vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+ vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+
+ /* Checking the mv status*/
+ BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+
+ /* For checking bS[I] = 1 */
+ mov r6, #1
+ vdup.u8 q0, r6
+
+ vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+ vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+
+
+ /* Check bS[I] is '1' or '2' */
+ vmax.u8 q1, q12, q14
+ vmax.u8 q0, q13, q15
+
+ //vstm r5, {q0, q1}
+ vst1.32 {q0, q1}, [r5]
+ vpop {q4}
+ ldmia sp!, {r5-r7}
+WELS_ASM_FUNC_END
+#endif
--- /dev/null
+++ b/codec/common/arm/expand_picture_neon.S
@@ -1,0 +1,137 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
+ stmdb sp!, {r4-r8}
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
+
+ add r4, r7, r2
+ sub r4, #1
+ //For the left and right expand
+_expand_picture_luma_loop2:
+ sub r5, r7, #32
+ add r6, r4, #1
+
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
+
+ vst1.8 {q0}, [r5]!
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]!
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_luma_loop2
+
+ //for the top and bottom expand
+ add r2, #64
+ sub r0, #32
+ mla r4, r1, r3, r0
+ sub r4, r1
+_expand_picture_luma_loop0:
+ mov r5, #32
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
+
+ mov r8, #32
+_expand_picture_luma_loop1:
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_luma_loop1
+
+ subs r2, #16
+ bne _expand_picture_luma_loop0
+
+ //vldreq.32 d0, [r0]
+
+ ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
+ stmdb sp!, {r4-r8}
+ //Save the dst
+ mov r7, r0
+ mov r8, r3
+
+ add r4, r7, r2
+ sub r4, #1
+ //For the left and right expand
+_expand_picture_chroma_loop2:
+ sub r5, r7, #16
+ add r6, r4, #1
+
+ vld1.8 {d0[], d1[]}, [r7], r1
+ vld1.8 {d2[], d3[]}, [r4], r1
+
+ vst1.8 {q0}, [r5]
+ vst1.8 {q1}, [r6]
+ subs r8, #1
+ bne _expand_picture_chroma_loop2
+
+ //for the top and bottom expand
+ add r2, #32
+ sub r0, #16
+ mla r4, r1, r3, r0
+ sub r4, r1
+_expand_picture_chroma_loop0:
+ mov r5, #16
+ mls r5, r5, r1, r0
+ add r6, r4, r1
+ vld1.8 {q0}, [r0]!
+ vld1.8 {q1}, [r4]!
+
+ mov r8, #16
+_expand_picture_chroma_loop1:
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
+ subs r8, #1
+ bne _expand_picture_chroma_loop1
+
+ subs r2, #16
+ bne _expand_picture_chroma_loop0
+
+ //vldreq.32 d0, [r0]
+
+ ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+#endif
--- /dev/null
+++ b/codec/common/arm/mc_neon.S
@@ -1,0 +1,2210 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef __APPLE__
+.macro AVERAGE_TWO_8BITS
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, $2, $1
+ vrshrn.u16 $0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+ vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
+ vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 $0, $0, $0
+ vpadd.s16 $0, $0, $0
+ vqrshrun.s16 $0, $4, #5
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $2, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $3, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_TO_16BITS
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+.macro FILTER_3_IN_16BITS_TO_8BITS
+// { // input:a, b, c, dst_d;
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $3, $0, #6 //(+32)>>6
+// }
+.endm
+
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 $4, $0, $1, #2 //src[0]
+ vext.16 $3, $0, $1, #3 //src[1]
+ vadd.s16 $4, $3 //c=src[0]+src[1]
+
+ vext.16 $3, $0, $1, #1 //src[-1]
+ vext.16 $2, $0, $1, #4 //src[2]
+ vadd.s16 $3, $2 //b=src[-1]+src[2]
+
+ vext.16 $2, $0, $1, #5 //src[3]
+ vadd.s16 $2, $0 //a=src[-2]+src[3]
+// }
+.endm
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
+ vrev64.16 $1, $1
+ vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
+ vshr.s64 $1, $2, #16
+ vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
+
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $0, $3, #6 //(+32)>>6
+// }
+.endm
+#else
+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, \arg2, \arg1
+ vrshrn.u16 \arg0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+ vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
+ vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 \arg0, \arg0, \arg0
+ vpadd.s16 \arg0, \arg0, \arg0
+ vqrshrun.s16 \arg0, \arg4, #5
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg2, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg3, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
+// }
+.endm
+
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 \arg4, \arg0, \arg1, #2 //src[0]
+ vext.16 \arg3, \arg0, \arg1, #3 //src[1]
+ vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
+
+ vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
+ vext.16 \arg2, \arg0, \arg1, #4 //src[2]
+ vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
+
+ vext.16 \arg2, \arg0, \arg1, #5 //src[3]
+ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
+// }
+.endm
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+ vrev64.16 \arg1, \arg1
+ vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
+ vshr.s64 \arg1, \arg2, #16
+ vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
+
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
+// }
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_h_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_h_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_10_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_10_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_30_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q8, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_30_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
+
+w16_xy_01_luma_loop:
+
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_01_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_01_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_01_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_xy_01_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_01_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
+
+w16_xy_03_luma_loop:
+
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_03_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_03_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_03_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_xy_03_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_03_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
+
+w16_v_mc_luma_loop:
+
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_v_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_v_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_v_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vmov r4, r5, d16
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vmov r5, r6, d16
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_v_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
+
+w16_hv_mc_luma_loop:
+
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2], r3 //write 16Byte
+
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+
+ vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_hv_mc_luma_loop
+ vpop {q4-q7}
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
+
+w8_hv_mc_luma_loop:
+
+ vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
+
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
+
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
+
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2], r3 //write 8Byte
+
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_hv_mc_luma_loop
+ vpop {q4}
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
+ push {r4 ,r5, r6}
+ vpush {q4-q7}
+ ldr r6, [sp, #76]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
+
+w4_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
+
+ //the 1st&2nd row
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
+
+ vmov d23, d0
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //the 3rd&4th row
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
+
+ vmov d23, d4
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //q4~q6, q0~q1, --> q0~q4
+ vswp q4, q0
+ vmov q3, q4
+ vmov q4, q1
+ vmov q1, q5
+ vmov q2, q6
+
+ sub r6, #4
+ cmp r6, #0
+ bne w4_hv_mc_luma_loop
+
+ vpop {q4-q7}
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w16_copy_loop:
+ vld1.u8 {q0}, [r0], r1
+ sub r4, #2
+ vld1.u8 {q1}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ cmp r4, #0
+ vst1.u8 {q1}, [r2], r3
+ bne w16_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w8_copy_loop:
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vst1.u8 {d1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w8_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+w4_copy_loop:
+ ldr r5, [r0], r1
+ ldr r6, [r0], r1
+ str r5, [r2], r3
+ str r6, [r2], r3
+
+ sub r4, #2
+ cmp r4, #0
+ bne w4_copy_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2]!
+ vld1.u8 {q1}, [r3]!
+ vld1.u8 {q2}, [r2]!
+ vld1.u8 {q3}, [r3]!
+
+ vld1.u8 {q8}, [r2]!
+ vld1.u8 {q9}, [r3]!
+ vld1.u8 {q10}, [r2]!
+ vld1.u8 {q11}, [r3]!
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
+
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_pix_avg_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ mov r5, #16
+w8_pix_avg_loop:
+
+ vld1.u8 {d0}, [r2], r5
+ vld1.u8 {d2}, [r3], r5
+ vld1.u8 {d1}, [r2], r5
+ vld1.u8 {d3}, [r3], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
+
+ vld1.u8 {d4}, [r2], r5
+ vld1.u8 {d6}, [r3], r5
+ vld1.u8 {d5}, [r2], r5
+ vld1.u8 {d7}, [r3], r5
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_pix_avg_loop
+
+ pop {r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
+ push {r4-r8}
+ ldr r4, [sp, #20]
+w4_pix_avg_loop:
+
+ ldr r5, [r2]
+ ldr r6, [r2, #16]
+ ldr r7, [r3]
+ ldr r8, [r3, #16]
+ add r2, #32
+ add r3, #32
+
+ vmov d0, r5, r6
+ vmov d1, r7, r8
+ AVERAGE_TWO_8BITS d0, d0, d1
+ vmov r5, r6, d0
+
+ str r5, [r0], r1
+ str r6, [r0], r1
+
+ sub r4, #2
+ cmp r4, #0
+ bne w4_pix_avg_loop
+
+ pop {r4-r8}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ ldr r5, [sp, #12]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+ vld1.u8 {q0}, [r0], r1 //src[x]
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
+ vext.u8 d1, d0, d1, #1 //src[x+1]
+
+w8_mc_chroma_loop: // each two pxl row
+ vld1.u8 {q1}, [r0], r1 //src[x+stride]
+ vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
+ vext.u8 d3, d2, d3, #1 //src[x+stride+1]
+ vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
+
+ vmull.u8 q3, d0, d28 //(src[x] * A)
+ vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmull.u8 q3, d2, d28 //(src[x] * A)
+ vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmov q0, q2
+ sub r5, #2
+ cmp r5, #0
+ bne w8_mc_chroma_loop
+
+ pop {r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
+
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r6, [sp, #16]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
+w4_mc_chroma_loop: // each two pxl row
+ vld1.u8 {d0}, [r0], r1 //a::src[x]
+ vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
+ vld1.u8 {d4}, [r0] //c::src[x+2*stride]
+
+ vshr.u64 d1, d0, #8
+ vshr.u64 d3, d2, #8
+ vshr.u64 d5, d4, #8
+
+ vmov q3, q1 //b::[0:7]+b::[1~8]
+ vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+ vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+
+ vmull.u8 q1, d0, d28 //(src[x] * A)
+ vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
+
+ vrshrn.u16 d2, q1, #6
+ vmov r4, r5, d2
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_mc_chroma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d3, r5, r4 // 0x0014FFFB00010000
+
+ sub r3, #16
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w17_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q8, q0, q1, #3 //q8=src[1]
+ vext.8 q9, q0, q1, #4 //q9=src[2]
+ vext.8 q10, q0, q1, #5 //q10=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
+
+ vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
+
+ vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
+
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ sub r4, #1
+ cmp r4, #0
+ bne w17_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d7, r5, r4 // 0x0014FFFB00010000
+
+ sub r3, #8
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w9_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d16}, [r2]! //write [0:7] Byte
+
+ vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
+ vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
+
+ cmp r4, #0
+ bne w9_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q8}, [r0], r1 //q8=src[2]
+
+w17_v_mc_luma_loop:
+
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
+ vld1.u8 {q8}, [r0], r1 //read 6th row
+ vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
+ vld1.u8 {q9}, [r0], r1 //read 7th row
+ vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q8, q9, q0 --> q0~q8
+ vswp q0, q8
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q9 //q0~q8
+
+ sub r4, #8
+ cmp r4, #1
+ bne w17_v_mc_luma_loop
+ // the last 16Bytes
+ vld1.u8 {q9}, [r0], r1 //q9=src[3]
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
+ vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w9_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_v_mc_luma_loop
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
+ vst1.u8 {d16}, [r2], r3 //write last 8Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
+ push {r4}
+ vpush {q4-q7}
+ ldr r4, [sp, #68]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+ sub r3, #16
+
+w17_hv_mc_luma_loop:
+
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {d0, d1}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
+ vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
+ vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
+ vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #1
+ bne w17_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ vpop {q4-q7}
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
+ push {r4}
+ vpush {q4}
+ ldr r4, [sp, #20]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
+ sub r3, #8
+
+w9_hv_mc_luma_loop:
+
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+
+ //q4~q8, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q8
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
+ vst1.u8 d18, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
+ vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
+ vpop {q4}
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+
+enc_w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2], r3
+ vld1.u8 {q1}, [r4], r5
+ vld1.u8 {q2}, [r2], r3
+ vld1.u8 {q3}, [r4], r5
+
+ vld1.u8 {q8}, [r2], r3
+ vld1.u8 {q9}, [r4], r5
+ vld1.u8 {q10}, [r2], r3
+ vld1.u8 {q11}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d16, d16, d18
+ AVERAGE_TWO_8BITS d17, d17, d19
+ vst1.u8 {q8}, [r0], r1
+
+ AVERAGE_TWO_8BITS d20, d20, d22
+ AVERAGE_TWO_8BITS d21, d21, d23
+ vst1.u8 {q10}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w16_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+enc_w8_pix_avg_loop:
+
+ vld1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r4], r5
+ vld1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
+
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r4], r5
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r4], r5
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w8_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+#endif
--- a/codec/common/arm_arch_common_macro.S
+++ /dev/null
@@ -1,64 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef __APPLE__
-
-.macro WELS_ASM_FUNC_BEGIN
-.align 2
-.arm
-.globl _$0
-_$0:
-.endm
-
-.macro WELS_ASM_FUNC_END
-mov pc, lr
-.endm
-#else
-
-.syntax unified
-.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
-.text
-
-.macro WELS_ASM_FUNC_BEGIN funcName
-.align 2
-.arm
-.global \funcName
-.type \funcName, %function
-.func \funcName
-\funcName:
-.endm
-
-.macro WELS_ASM_FUNC_END
-mov pc, lr
-.endfunc
-.endm
-#endif
--- a/codec/common/asm_inc.asm
+++ /dev/null
@@ -1,599 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* sse2inc.asm
-;*
-;* Abstract
-;* macro and constant
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
- %define MOVDQ movdqa
-%else
- %define MOVDQ movdqu
-%endif
-
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
-
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-DEFAULT REL
-
-%ifdef WIN64 ; Windows x64 ;************************************
-
-BITS 64
-
-%define arg1 rcx
-%define arg2 rdx
-%define arg3 r8
-%define arg4 r9
-%define arg5 [rsp + push_num*8 + 40]
-%define arg6 [rsp + push_num*8 + 48]
-%define arg7 [rsp + push_num*8 + 56]
-%define arg8 [rsp + push_num*8 + 64]
-%define arg9 [rsp + push_num*8 + 72]
-%define arg10 [rsp + push_num*8 + 80]
-%define arg11 [rsp + push_num*8 + 88]
-%define arg12 [rsp + push_num*8 + 96]
-
-%define r0 rcx
-%define r1 rdx
-%define r2 r8
-%define r3 r9
-%define r4 rax
-%define r5 r10
-%define r6 r11
-%define r7 rsp
-
-%define r0d ecx
-%define r1d edx
-%define r2d r8d
-%define r3d r9d
-%define r4d eax
-%define r5d r10d
-%define r6d r11d
-
-%define r0w cx
-%define r1w dx
-%define r2w r8w
-%define r3w r9w
-
-%define r0b cl
-%define r1b dl
-%define r2b r8l
-%define r3b r9l
-
-%define PUSHRFLAGS pushfq
-%define POPRFLAGS popfq
-%define retrq rax
-%define retrd eax
-
-%elifdef UNIX64 ; Unix x64 ;************************************
-
-BITS 64
-
-%define arg1 rdi
-%define arg2 rsi
-%define arg3 rdx
-%define arg4 rcx
-%define arg5 r8
-%define arg6 r9
-%define arg7 [rsp + push_num*8 + 8]
-%define arg8 [rsp + push_num*8 + 16]
-%define arg9 [rsp + push_num*8 + 24]
-%define arg10 [rsp + push_num*8 + 32]
-%define arg11 [rsp + push_num*8 + 40]
-%define arg12 [rsp + push_num*8 + 48]
-
-%define r0 rdi
-%define r1 rsi
-%define r2 rdx
-%define r3 rcx
-%define r4 r8
-%define r5 r9
-%define r6 r10
-%define r7 rsp
-
-%define r0d edi
-%define r1d esi
-%define r2d edx
-%define r3d ecx
-%define r4d r8d
-%define r5d r9d
-%define r6d r10d
-
-%define r0w di
-%define r1w si
-%define r2w dx
-%define r3w cx
-
-%define r0b dil
-%define r1b sil
-%define r2b dl
-%define r3b cl
-
-%define PUSHRFLAGS pushfq
-%define POPRFLAGS popfq
-%define retrq rax
-%define retrd eax
-
-%elifdef X86_32 ; X86_32 ;************************************
-
-BITS 32
-
-%define arg1 [esp + push_num*4 + 4]
-%define arg2 [esp + push_num*4 + 8]
-%define arg3 [esp + push_num*4 + 12]
-%define arg4 [esp + push_num*4 + 16]
-%define arg5 [esp + push_num*4 + 20]
-%define arg6 [esp + push_num*4 + 24]
-%define arg7 [esp + push_num*4 + 28]
-%define arg8 [esp + push_num*4 + 32]
-%define arg9 [esp + push_num*4 + 36]
-%define arg10 [esp + push_num*4 + 40]
-%define arg11 [esp + push_num*4 + 44]
-%define arg12 [esp + push_num*4 + 48]
-
-%define r0 eax
-%define r1 ecx
-%define r2 edx
-%define r3 ebx
-%define r4 esi
-%define r5 edi
-%define r6 ebp
-%define r7 esp
-
-%define r0d eax
-%define r1d ecx
-%define r2d edx
-%define r3d ebx
-%define r4d esi
-%define r5d edi
-%define r6d ebp
-
-%define r0w ax
-%define r1w cx
-%define r2w dx
-%define r3w bx
-
-%define r0b al
-%define r1b cl
-%define r2b dl
-%define r3b bl
-
-%define PUSHRFLAGS pushfd
-%define POPRFLAGS popfd
-%define retrq eax ; 32 bit mode do not support 64 bits regesters
-%define retrd eax
-
-%endif
-
-%macro LOAD_PARA 2
- mov %1, %2
-%endmacro
-
-%macro LOAD_1_PARA 0
- %ifdef X86_32
- mov r0, [esp + push_num*4 + 4]
- %endif
-%endmacro
-
-%macro LOAD_2_PARA 0
- %ifdef X86_32
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- %endif
-%endmacro
-
-%macro LOAD_3_PARA 0
- %ifdef X86_32
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
- %endif
-%endmacro
-
-%macro LOAD_4_PARA 0
- %ifdef X86_32
- push r3
- %assign push_num push_num+1
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
- mov r3, [esp + push_num*4 + 16]
- %endif
-%endmacro
-
-%macro LOAD_5_PARA 0
- %ifdef X86_32
- push r3
- push r4
- %assign push_num push_num+2
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
- mov r3, [esp + push_num*4 + 16]
- mov r4, [esp + push_num*4 + 20]
- %elifdef WIN64
- mov r4, [rsp + push_num*8 + 40]
- %endif
-%endmacro
-
-%macro LOAD_6_PARA 0
- %ifdef X86_32
- push r3
- push r4
- push r5
- %assign push_num push_num+3
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
- mov r3, [esp + push_num*4 + 16]
- mov r4, [esp + push_num*4 + 20]
- mov r5, [esp + push_num*4 + 24]
- %elifdef WIN64
- mov r4, [rsp + push_num*8 + 40]
- mov r5, [rsp + push_num*8 + 48]
- %endif
-%endmacro
-
-%macro LOAD_7_PARA 0
- %ifdef X86_32
- push r3
- push r4
- push r5
- push r6
- %assign push_num push_num+4
- mov r0, [esp + push_num*4 + 4]
- mov r1, [esp + push_num*4 + 8]
- mov r2, [esp + push_num*4 + 12]
- mov r3, [esp + push_num*4 + 16]
- mov r4, [esp + push_num*4 + 20]
- mov r5, [esp + push_num*4 + 24]
- mov r6, [esp + push_num*4 + 28]
- %elifdef WIN64
- mov r4, [rsp + push_num*8 + 40]
- mov r5, [rsp + push_num*8 + 48]
- mov r6, [rsp + push_num*8 + 56]
- %elifdef UNIX64
- mov r6, [rsp + push_num*8 + 8]
- %endif
-%endmacro
-
-
-
-%macro LOAD_4_PARA_POP 0
- %ifdef X86_32
- pop r3
- %endif
-%endmacro
-
-%macro LOAD_5_PARA_POP 0
- %ifdef X86_32
- pop r4
- pop r3
- %endif
-%endmacro
-
-%macro LOAD_6_PARA_POP 0
- %ifdef X86_32
- pop r5
- pop r4
- pop r3
- %endif
-%endmacro
-
-%macro LOAD_7_PARA_POP 0
- %ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
- %endif
-%endmacro
-
-%macro PUSH_XMM 1
- %ifdef WIN64
- %assign xmm_num_regs %1
- %if xmm_num_regs > 6
- %ifdef push_num
- %assign push_num push_num+2*(%1-6)
- %endif
- sub rsp, 16*(%1 - 6)
- movdqu [rsp], xmm6
- %endif
- %if xmm_num_regs > 7
- movdqu [rsp+16], xmm7
- %endif
- %if xmm_num_regs > 8
- movdqu [rsp+32], xmm8
- %endif
- %if xmm_num_regs > 9
- movdqu [rsp+48], xmm9
- %endif
- %if xmm_num_regs > 10
- movdqu [rsp+64], xmm10
- %endif
- %if xmm_num_regs > 11
- movdqu [rsp+80], xmm11
- %endif
- %if xmm_num_regs > 12
- movdqu [rsp+96], xmm12
- %endif
- %if xmm_num_regs > 13
- movdqu [rsp+112], xmm13
- %endif
- %if xmm_num_regs > 14
- movdqu [rsp+128], xmm14
- %endif
- %if xmm_num_regs > 15
- movdqu [rsp+144], xmm15
- %endif
- %endif
-%endmacro
-
-%macro POP_XMM 0
- %ifdef WIN64
- %if xmm_num_regs > 15
- movdqu xmm15, [rsp+144]
- %endif
- %if xmm_num_regs > 14
- movdqu xmm14, [rsp+128]
- %endif
- %if xmm_num_regs > 13
- movdqu xmm13, [rsp+112]
- %endif
- %if xmm_num_regs > 12
- movdqu xmm12, [rsp+96]
- %endif
- %if xmm_num_regs > 11
- movdqu xmm11, [rsp+80]
- %endif
- %if xmm_num_regs > 10
- movdqu xmm10, [rsp+64]
- %endif
- %if xmm_num_regs > 9
- movdqu xmm9, [rsp+48]
- %endif
- %if xmm_num_regs > 8
- movdqu xmm8, [rsp+32]
- %endif
- %if xmm_num_regs > 7
- movdqu xmm7, [rsp+16]
- %endif
- %if xmm_num_regs > 6
- movdqu xmm6, [rsp]
- add rsp, 16*(xmm_num_regs - 6)
- %endif
- %endif
-%endmacro
-
-%macro SIGN_EXTENSION 2
- %ifndef X86_32
- movsxd %1, %2
- %endif
-%endmacro
-
-%macro SIGN_EXTENSIONW 2
- %ifndef X86_32
- movsx %1, %2
- %endif
-%endmacro
-
-%macro WELS_EXTERN 1
- ALIGN 16
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
- %1:
-%endmacro
-
-%macro WELS_AbsW 2
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_XSwap 4
- movq %4, %2
- punpckh%1 %4, %3
- punpckl%1 %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
- MMX_XSwap wd, %1, %2, %5
- MMX_XSwap wd, %3, %4, %2
- MMX_XSwap dq, %1, %3, %4
- MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
- movdqa %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
- SSE2_XSawp dq, %1, %2, %5
- SSE2_XSawp dq, %3, %4, %2
- SSE2_XSawp qdq, %1, %3, %4
- SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
- SSE2_XSawp wd, %1, %2, %5
- SSE2_XSawp wd, %3, %4, %2
- SSE2_XSawp dq, %1, %3, %4
- SSE2_XSawp dq, %5, %2, %3
- SSE2_XSawp qdq, %1, %5, %2
- SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in: m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
-
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
- movdqa %3, %2
- paddw %2, %1
- psubw %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro WELS_Zero 1
- pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/common/cpu.cpp
+++ /dev/null
@@ -1,293 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file cpu.cpp
- *
- * \brief CPU compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <stdio.h>
-#ifdef ANDROID_NDK
-#include <cpu-features.h>
-#endif
-#include "cpu.h"
-#include "cpu_core.h"
-
-
-
-#define CPU_Vendor_AMD "AuthenticAMD"
-#define CPU_Vendor_INTEL "GenuineIntel"
-#define CPU_Vendor_CYRIX "CyrixInstead"
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- uint32_t uiCPU = 0;
- uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
- int32_t CacheLineSize = 0;
- int8_t chVendorName[16] = { 0 };
- uint32_t uiMaxCpuidLevel = 0;
-
- if (!WelsCPUIdVerify()) {
- /* cpuid is not supported in cpu */
- return 0;
- }
-
- WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVendorName[0], (uint32_t*)&chVendorName[8], (uint32_t*)&chVendorName[4]);
- uiMaxCpuidLevel = uiFeatureA;
- if (uiMaxCpuidLevel == 0) {
- /* maximum input value for basic cpuid information */
- return 0;
- }
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if ((uiFeatureD & 0x00800000) == 0) {
- /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
- return 0;
- }
-
- uiCPU = WELS_CPU_MMX;
- if (uiFeatureD & 0x02000000) {
- /* SSE technology is identical to AMD MMX extensions */
- uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
- }
- if (uiFeatureD & 0x04000000) {
- /* SSE2 support here */
- uiCPU |= WELS_CPU_SSE2;
- }
- if (uiFeatureD & 0x00000001) {
- /* x87 FPU on-chip checking */
- uiCPU |= WELS_CPU_FPU;
- }
- if (uiFeatureD & 0x00008000) {
- /* CMOV instruction checking */
- uiCPU |= WELS_CPU_CMOV;
- }
- if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) ||
- (!strcmp((const char*)chVendorName, CPU_Vendor_AMD)) ) { // confirmed_safe_unsafe_usage
- if (uiFeatureD & 0x10000000) {
- /* Multi-Threading checking: contains of multiple logic processors */
- uiCPU |= WELS_CPU_HTT;
- }
- }
-
- if (uiFeatureC & 0x00000001) {
- /* SSE3 support here */
- uiCPU |= WELS_CPU_SSE3;
- }
- if (uiFeatureC & 0x00000200) {
- /* SSSE3 support here */
- uiCPU |= WELS_CPU_SSSE3;
- }
- if (uiFeatureC & 0x00080000) {
- /* SSE4.1 support here, 45nm Penryn processor */
- uiCPU |= WELS_CPU_SSE41;
- }
- if (uiFeatureC & 0x00100000) {
- /* SSE4.2 support here, next generation Nehalem processor */
- uiCPU |= WELS_CPU_SSE42;
- }
- if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {
- /* AVX supported */
- uiCPU |= WELS_CPU_AVX;
- }
- if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {
- /* AVX FMA supported */
- uiCPU |= WELS_CPU_FMA;
- }
- if (uiFeatureC & 0x02000000) {
- /* AES checking */
- uiCPU |= WELS_CPU_AES;
- }
- if (uiFeatureC & 0x00400000) {
- /* MOVBE checking */
- uiCPU |= WELS_CPU_MOVBE;
- }
-
- if( pNumberOfLogicProcessors != NULL ){
- if( uiCPU & WELS_CPU_HTT){
- *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
- } else {
- *pNumberOfLogicProcessors = 0;
- }
- if( !strcmp((const char*)chVendorName, CPU_Vendor_INTEL) ){
- if( uiMaxCpuidLevel >= 4 ){
- uiFeatureC = 0;
- WelsCPUId(0x4, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if( uiFeatureA != 0 ){
- *pNumberOfLogicProcessors = ((uiFeatureA&0xfc000000)>>26) + 1;
- }
- }
- }
- }
-
- WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- if ((!strcmp ((const char*)chVendorName, CPU_Vendor_AMD))
- && (uiFeatureA >= 0x80000001)) { // confirmed_safe_unsafe_usage
- WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if (uiFeatureD & 0x00400000) {
- uiCPU |= WELS_CPU_MMXEXT;
- }
- if (uiFeatureD & 0x80000000) {
- uiCPU |= WELS_CPU_3DNOW;
- }
- }
-
- if (!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) { // confirmed_safe_unsafe_usage
- int32_t family, model;
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
- model = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
- if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
- uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
- }
- }
-
- // get cache line size
- if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL))
- || ! (strcmp ((const char*)chVendorName, CPU_Vendor_CYRIX))) { // confirmed_safe_unsafe_usage
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- CacheLineSize = (uiFeatureB & 0xff00) >>
- 5; // ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
- if (CacheLineSize == 128) {
- uiCPU |= WELS_CPU_CACHELINE_128;
- } else if (CacheLineSize == 64) {
- uiCPU |= WELS_CPU_CACHELINE_64;
- } else if (CacheLineSize == 32) {
- uiCPU |= WELS_CPU_CACHELINE_32;
- } else if (CacheLineSize == 16) {
- uiCPU |= WELS_CPU_CACHELINE_16;
- }
- }
-
- return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
- if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
- WelsEmms();
- }
-}
-
-#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
-#if defined(ANDROID_NDK)
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
-{
- uint32_t uiCPU = 0;
- AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
- uint64_t uiFeatures = 0;
- cpuFamily = android_getCpuFamily();
- if (cpuFamily == ANDROID_CPU_FAMILY_ARM) {
- uiFeatures = android_getCpuFeatures();
- if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
- uiCPU |= WELS_CPU_ARMv7;
- }
- if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
- uiCPU |= WELS_CPU_VFPv3;
- }
- if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
- uiCPU |= WELS_CPU_NEON;
- }
- }
-
- if( pNumberOfLogicProcessors != NULL ){
- *pNumberOfLogicProcessors = android_getCpuCount();
- }
-
- return uiCPU;
-}
-
-#elif defined(__APPLE__)
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
-{
- uint32_t uiCPU = 0;
-
-#if defined(__ARM_NEON__)
- uiCPU |= WELS_CPU_ARMv7;
- uiCPU |= WELS_CPU_VFPv3;
- uiCPU |= WELS_CPU_NEON;
-#endif
- return uiCPU;
-}
-#elif defined(__linux__)
-
-/* Generic arm/linux cpu feature detection */
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- FILE *f = fopen("/proc/cpuinfo", "r");
-
- if (!f)
- return 0;
-
- char buf[200];
- int flags = 0;
- while (fgets(buf, sizeof(buf), f)) {
- if (!strncmp(buf, "Features", strlen("Features"))) {
- if (strstr(buf, " neon "))
- flags |= WELS_CPU_NEON;
- if (strstr(buf, " vfpv3 "))
- flags |= WELS_CPU_VFPv3;
- break;
- }
- }
- fclose(f);
- return flags;
-}
-
-#else /* HAVE_NEON enabled but no runtime detection */
-
-/* No runtime feature detection available, but built with HAVE_NEON - assume
- * that NEON and all associated features are available. */
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- return WELS_CPU_ARMv7 |
- WELS_CPU_VFPv3 |
- WELS_CPU_NEON;
-}
-#endif
-#else /* Neither X86_ASM nor HAVE_NEON */
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- return 0;
-}
-
-#endif
-
-
--- a/codec/common/cpu.h
+++ /dev/null
@@ -1,80 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file cpu.h
- *
- * \brief CPU feature compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-#if !defined(WELS_CPU_DETECTION_H__)
-#define WELS_CPU_DETECTION_H__
-
-#include "typedefs.h"
-#include "cpu_core.h"
-
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(X86_ASM)
-/*
- * cpuid support verify routine
- * return 0 if cpuid is not supported by cpu
- */
-int32_t WelsCPUIdVerify();
-
-void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void WelsEmms();
-
-/*
- * clear FPU registers states for potential float based calculation if support
- */
-void WelsCPURestore (const uint32_t kuiCPU);
-
-#else
-#define WelsEmms()
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpu_core.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file cpu_core.h
- *
- * \brief cpu core feature detection
- *
- * \date 4/24/2009 Created
- *
- *************************************************************************************
- */
-#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
-#define WELS_CPU_CORE_FEATURE_DETECTION_H__
-
-/*
- * WELS CPU feature flags
- */
-#define WELS_CPU_MMX 0x00000001 /* mmx */
-#define WELS_CPU_MMXEXT 0x00000002 /* mmx-ext*/
-#define WELS_CPU_SSE 0x00000004 /* sse */
-#define WELS_CPU_SSE2 0x00000008 /* sse 2 */
-#define WELS_CPU_SSE3 0x00000010 /* sse 3 */
-#define WELS_CPU_SSE41 0x00000020 /* sse 4.1 */
-#define WELS_CPU_3DNOW 0x00000040 /* 3dnow! */
-#define WELS_CPU_3DNOWEXT 0x00000080 /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC 0x00000100 /* altivec */
-#define WELS_CPU_SSSE3 0x00000200 /* ssse3 */
-#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */
-#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
-#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
- physical processor package is capable of supporting more than one logic processor
- */
-#define WELS_CPU_CMOV 0x00004000 /* Conditional Move Instructions,
- also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
- */
-#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
-#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
-#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
-
-/* For the android OS */
-#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */
-#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
-#define WELS_CPU_NEON 0x000004 /* NEON */
-
-/*
- * Interfaces for CPU core feature detection as below
- */
-
-#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* cpu_mmx.asm
-;*
-;* Abstract
-;* verify cpuid feature support and cpuid detection
-;*
-;* History
-;* 04/29/2009 Created
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-;******************************************************************************************
-; int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WELS_EXTERN WelsCPUIdVerify
- push r1
- PUSHRFLAGS
- PUSHRFLAGS
-
- pop r1
- mov eax, r1d
- xor eax, 00200000h
- xor eax, r1d
- POPRFLAGS
- pop r1
- ret
-
-;****************************************************************************************************
-; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-%ifdef WIN64
-
-WELS_EXTERN WelsCPUId
- push rbx
- push rdx
-
- mov eax, ecx
- mov rcx, [r9]
- cpuid
- mov [r9], ecx
- mov [r8], ebx
- mov rcx, [rsp + 2*8 + 40]
- mov [rcx], edx
- pop rdx
- mov [rdx], eax
-
- pop rbx
- ret
-
-%elifdef UNIX64
-WELS_EXTERN WelsCPUId
- push rbx
- push rcx
- push rdx
-
- mov eax, edi
- mov rcx, [rcx]
- cpuid
- mov [r8], edx
- pop rdx
- pop r8
- mov [r8], ecx
- mov [rdx], ebx
- mov [rsi], eax
-
- pop rbx
- ret
-
-%elifdef X86_32
-
-WELS_EXTERN WelsCPUId
- push ebx
- push edi
-
- mov eax, [esp+12] ; operating index
- mov edi, [esp+24]
- mov ecx, [edi]
- cpuid ; cpuid
-
- ; processing various information return
- mov edi, [esp+16]
- mov [edi], eax
- mov edi, [esp+20]
- mov [edi], ebx
- mov edi, [esp+24]
- mov [edi], ecx
- mov edi, [esp+28]
- mov [edi], edx
-
- pop edi
- pop ebx
- ret
-
-%endif
-
-; need call after cpuid=1 and eax, ecx flag got then
-;****************************************************************************************************
-; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WELS_EXTERN WelsCPUSupportAVX
-%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
-%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
-%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
-%endif
-
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
-avx_not_supported:
- mov eax, 0
- ret
-
-
-; need call after cpuid=1 and eax, ecx flag got then
-;****************************************************************************************************
-; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WELS_EXTERN WelsCPUSupportFMA
-%ifdef WIN64
- mov eax, ecx
- mov ecx, edx
-%elifdef UNIX64
- mov eax, edi
- mov ecx, esi
-%else
- mov eax, [esp+4]
- mov ecx, [esp+8]
-%endif
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
-fma_not_supported:
- mov eax, 0
- ret
-
-;******************************************************************************************
-; void WelsEmms()
-;******************************************************************************************
-WELS_EXTERN WelsEmms
- emms ; empty mmx technology states
- ret
-
--- a/codec/common/crt_util_safe_x.cpp
+++ /dev/null
@@ -1,256 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file crt_utils_safe_x.cpp
- *
- * \brief common tool/function utilization
- *
- * \date 03/10/2009 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <time.h>
-#if defined(_WIN32)
-#include <windows.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#ifndef _MSC_VER
-#include <sys/time.h>
-#endif //!_MSC_VER
-#else
-#include <sys/time.h>
-#include <sys/timeb.h>
-#endif //_WIN32
-
-#include "macros.h"
-#include "crt_util_safe_x.h" // Safe CRT routines like utils for cross platforms
-
-#if defined(_WIN32) && defined(_MSC_VER)
-
-#if defined(_MSC_VER) && (_MSC_VER>=1500)
-
-int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
- va_list pArgPtr;
- int32_t iRc;
-
- va_start (pArgPtr, kpFormat);
-
- iRc = vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
-
- va_end (pArgPtr);
-
- return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
- strncpy_s (pDest, iSizeInBytes, kpSrc, _TRUNCATE);
-
- return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
- return vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
-}
-
-WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
- WelsFileHandle* pFp = NULL;
- if (fopen_s (&pFp, kpFilename, kpMode) != 0) {
- return NULL;
- }
-
- return pFp;
-}
-
-int32_t WelsFclose (WelsFileHandle* pFp) {
- return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
- return _ftime_s (pTp);
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
- struct tm sTimeNow;
- int32_t iRc;
-
- localtime_s (&sTimeNow, &kpTp->time);
-
- iRc = strftime (pBuffer, iSize, kpFormat, &sTimeNow);
- if (iRc == 0)
- pBuffer[0] = '\0';
- return iRc;
-}
-
-#else
-
-int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
- va_list pArgPtr;
- int32_t iRc;
-
- va_start (pArgPtr, kpFormat);
-
- iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
- if (iRc < 0)
- pBuffer[iSizeOfBuffer - 1] = '\0';
-
- va_end (pArgPtr);
-
- return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
- strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
- pDest[iSizeInBytes - 1] = '\0';
-
- return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
- int32_t iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
- if (iRc < 0)
- pBuffer[iSizeOfBuffer - 1] = '\0';
- return iRc;
-}
-
-
-WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
- return fopen (kpFilename, kpMode);
-}
-
-int32_t WelsFclose (WelsFileHandle* pFp) {
- return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
- _ftime (pTp);
- return 0;
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
- struct tm* pTnow;
- int32_t iRc;
-
- pTnow = localtime (&kpTp->time);
-
- iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
- if (iRc == 0)
- pBuffer[0] = '\0';
- return iRc;
-}
-
-
-#endif // _MSC_VER
-
-#else //GCC
-
-int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
- va_list pArgPtr;
- int32_t iRc;
-
- va_start (pArgPtr, kpFormat);
-
- iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
-
- va_end (pArgPtr);
-
- return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
- strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
- pDest[iSizeInBytes - 1] = '\0';
- return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
- return vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
-}
-
-WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
- return fopen (kpFilename, kpMode);
-}
-
-int32_t WelsFclose (WelsFileHandle* pFp) {
- return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
- struct timeval sTv;
-
- if (gettimeofday (&sTv, NULL)) {
- return -1;
- }
-
- pTp->time = sTv.tv_sec;
- pTp->millitm = (uint16_t)sTv.tv_usec / 1000;
-
- return 0;
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
- struct tm* pTnow;
- int32_t iRc;
-
- pTnow = localtime (&kpTp->time);
-
- iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
- if (iRc == 0)
- pBuffer[0] = '\0';
- return iRc;
-}
-
-#endif
-
-
-char* WelsStrcat (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
- int32_t iCurLen = strlen(pDest);
- return WelsStrncpy(pDest + iCurLen, iSizeInBytes - iCurLen, kpSrc);
-}
-
-int32_t WelsFwrite (const void* kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp) {
- return fwrite (kpBuffer, iSize, iCount, pFp);
-}
-
-uint16_t WelsGetMillisecond (const SWelsTime* kpTp) {
- return kpTp->millitm;
-}
-
-int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin) {
- return fseek(fp, offset, origin);
-}
-
-int32_t WelsFflush (WelsFileHandle* pFp) {
- return fflush (pFp);
-}
--- a/codec/common/crt_util_safe_x.h
+++ /dev/null
@@ -1,99 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2010-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file crt_util_safe_x.h
- *
- * \brief Safe CRT like util for cross platfroms support
- *
- * \date 06/04/2010 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
-#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-
-#if defined(_WIN32)
-#include <windows.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#else
-#include <sys/timeb.h>
-#include <sys/time.h>
-#include "typedefs.h"
-#endif//_WIN32
-
-#include "typedefs.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define WELS_FILE_SEEK_SET SEEK_SET
-#define WELS_FILE_SEEK_CUR SEEK_CUR
-#define WESL_FILE_SEEK_END SEEK_END
-
-typedef FILE WelsFileHandle;
-
-#ifdef _WIN32
-typedef struct _timeb SWelsTime;
-#else
-typedef struct timeb SWelsTime;
-#endif
-
-int32_t WelsSnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, ...);
-char* WelsStrncpy (char* dest, int32_t sizeInBytes, const char* src);
-char* WelsStrcat (char* dest, int32_t sizeInBytes, const char* src);
-int32_t WelsVsnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, va_list argptr);
-
-WelsFileHandle* WelsFopen (const char* filename, const char* mode);
-int32_t WelsFclose (WelsFileHandle* fp);
-int32_t WelsFread (void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
-int32_t WelsFwrite (const void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
-int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin);
-int32_t WelsFflush (WelsFileHandle* fp);
-
-int32_t WelsGetTimeOfDay (SWelsTime* tp);
-int32_t WelsStrftime (char* buffer, int32_t size, const char* format, const SWelsTime* tp);
-uint16_t WelsGetMillisecond (const SWelsTime* tp);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- a/codec/common/deblock.asm
+++ /dev/null
@@ -1,5278 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-ALIGN 16
-FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4
-
-
-SECTION .text
-
-%ifdef WIN64
-
-
-WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,[rsp + 16 + 20h] ; pTC
- PUSH_XMM 16
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,r8d
- movd xmm2,r9d
- mov qword [rbp+180h],r12
- mov r10,rcx
- movsxd r12,edx
- add edx,edx
- movsxd rdx,edx
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rcx]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx edx,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,edx
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rcx]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rcx]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rcx]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rcx]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rcx]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rcx],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rcx],xmm8
- movdqa [r12+rcx],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- POP_XMM
- pop rbp
- ret
-
-
-WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
-
-
-WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rdi
- PUSH_XMM 16
- sub rsp,0C8h
- mov r10,qword [rax + 30h] ; pTC
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- POP_XMM
- pop rdi
- pop rbx
- ret
-
-
-WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- PUSH_XMM 15
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
- POP_XMM
- pop rbx
- ret
-
-
-
-
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- PUSH_XMM 16
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h + 160] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea rsp,[rsp+140h]
- POP_XMM
- mov rbx, [rsp+28h]
- pop rdi
- ret
-
-
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- PUSH_XMM 16
- sub rsp,170h
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h+160] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h+160] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- POP_XMM
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
-
-
-
-%elifdef UNIX64
-
-
-WELS_EXTERN DeblockLumaLt4V_ssse3
- push rbp
- mov r11,r8 ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,edx
- movd xmm2,ecx
- mov qword [rbp+180h],r12
- mov r10,rdi
- movsxd r12,esi
- add rsi,rsi
- movsxd rdx,esi
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rdi]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx rsi,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,esi
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rdi]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rdi]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rdi]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rdi]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rdi]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rdi],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rdi],xmm8
- movdqa [r12+rdi],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
-
-
-WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
-
-WELS_EXTERN DeblockChromaLt4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r10, rdx
- mov r11, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rsi, r10
- mov r10, r9
- mov rbp, r8
- mov r8, rsi
- mov r9, r11
- sub rsp,0C8h
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rbp
- pop rbx
- ret
-
-WELS_EXTERN DeblockChromaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
-
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
-
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
- mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
-
- mov rbp, r8
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- mov rdi, rdx
-
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
-
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop r12
- pop rbp
- pop rbx
- ret
-
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push r12
- push r13
- push r14
- sub rsp,170h
-
- mov r13, r8
- mov r14, r9
- mov r8, rdx
- mov r9, rcx
- mov rdx, rdi
- mov rcx, rsi
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
- mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
- mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- ret
-
-
-
-%elifdef X86_32
-
-;********************************************************************************
-; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_ssse3
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
-
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408]
-
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaEq4V_ssse3
-
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-%endif
-
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
- push r3
- push r4
- push r5
-
-%assign push_num 3
- LOAD_3_PARA
- PUSH_XMM 8
-
- SIGN_EXTENSION r1, r1d
-
- mov r5, r7
- mov r3, r7
- and r3, 0Fh
- sub r7, r3
- sub r7, 10h
-
- lea r3, [r0 + r1 * 8]
- lea r4, [r1 * 3]
-
- movq xmm0, [r0]
- movq xmm7, [r3]
- punpcklqdq xmm0, xmm7
- movq xmm1, [r0 + r1]
- movq xmm7, [r3 + r1]
- punpcklqdq xmm1, xmm7
- movq xmm2, [r0 + r1*2]
- movq xmm7, [r3 + r1*2]
- punpcklqdq xmm2, xmm7
- movq xmm3, [r0 + r4]
- movq xmm7, [r3 + r4]
- punpcklqdq xmm3, xmm7
-
- lea r0, [r0 + r1 * 4]
- lea r3, [r3 + r1 * 4]
- movq xmm4, [r0]
- movq xmm7, [r3]
- punpcklqdq xmm4, xmm7
- movq xmm5, [r0 + r1]
- movq xmm7, [r3 + r1]
- punpcklqdq xmm5, xmm7
- movq xmm6, [r0 + r1*2]
- movq xmm7, [r3 + r1*2]
- punpcklqdq xmm6, xmm7
-
- movdqa [r7], xmm0
- movq xmm7, [r0 + r4]
- movq xmm0, [r3 + r4]
- punpcklqdq xmm7, xmm0
- movdqa xmm0, [r7]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- movdqa [r2], xmm4
- movdqa [r2 + 10h], xmm2
- movdqa [r2 + 20h], xmm3
- movdqa [r2 + 30h], xmm7
- movdqa [r2 + 40h], xmm5
- movdqa [r2 + 50h], xmm1
- movdqa [r2 + 60h], xmm6
- movdqa [r2 + 70h], xmm0
-
- mov r7, r5
- POP_XMM
- pop r5
- pop r4
- pop r3
- ret
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
- push r3
- push r4
-
-%assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
-
- SIGN_EXTENSION r1, r1d
-
- mov r4, r7
- mov r3, r7
- and r3, 0Fh
- sub r7, r3
- sub r7, 10h
-
- movdqa xmm0, [r2]
- movdqa xmm1, [r2 + 10h]
- movdqa xmm2, [r2 + 20h]
- movdqa xmm3, [r2 + 30h]
- movdqa xmm4, [r2 + 40h]
- movdqa xmm5, [r2 + 50h]
- movdqa xmm6, [r2 + 60h]
- movdqa xmm7, [r2 + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea r2, [r1 * 3]
-
- movq [r0], xmm4
- movq [r0 + r1], xmm2
- movq [r0 + r1*2], xmm3
- movq [r0 + r2], xmm7
-
- lea r0, [r0 + r1*4]
- movq [r0], xmm5
- movq [r0 + r1], xmm1
- movq [r0 + r1*2], xmm6
- movq [r0 + r2], xmm0
-
- psrldq xmm4, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea r0, [r0 + r1*4]
- movq [r0], xmm4
- movq [r0 + r1], xmm2
- movq [r0 + r1*2], xmm3
- movq [r0 + r2], xmm7
-
- lea r0, [r0 + r1*4]
- movq [r0], xmm5
- movq [r0 + r1], xmm1
- movq [r0 + r1*2], xmm6
- movq [r0 + r2], xmm0
-
-
- mov r7, r4
- POP_XMM
- pop r4
- pop r3
- ret
-
--- a/codec/common/deblocking_common.cpp
+++ /dev/null
@@ -1,204 +1,0 @@
-#include "deblocking_common.h"
-#include "macros.h"
-// C code only
-void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
- int8_t* pTc) {
- for (int32_t i = 0; i < 16; i++) {
- int32_t iTc0 = pTc[i >> 2];
- if (iTc0 >= 0) {
- int32_t p0 = pPix[-iStrideX];
- int32_t p1 = pPix[-2 * iStrideX];
- int32_t p2 = pPix[-3 * iStrideX];
- int32_t q0 = pPix[0];
- int32_t q1 = pPix[iStrideX];
- int32_t q2 = pPix[2 * iStrideX];
- bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
- bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
- int32_t iTc = iTc0;
- if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
- bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
- bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
- if (bDetaP2P0) {
- pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);
- iTc++;
- }
- if (bDetaQ2Q0) {
- pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);
- iTc++;
- }
- int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
- pPix[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
- pPix[0] = WelsClip1 (q0 - iDeta); /* q0' */
- }
- }
- pPix += iStrideY;
- }
-}
-void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
- int32_t p0, p1, p2, q0, q1, q2;
- int32_t iDetaP0Q0;
- bool bDetaP1P0, bDetaQ1Q0;
- for (int32_t i = 0; i < 16; i++) {
- p0 = pPix[-iStrideX];
- p1 = pPix[-2 * iStrideX];
- p2 = pPix[-3 * iStrideX];
- q0 = pPix[0];
- q1 = pPix[iStrideX];
- q2 = pPix[2 * iStrideX];
- iDetaP0Q0 = WELS_ABS (p0 - q0);
- bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
- if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
- if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
- bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
- bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
- if (bDetaP2P0) {
- const int32_t p3 = pPix[-4 * iStrideX];
- pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3; //p0
- pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2; //p1
- pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
- } else {
- pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; //p0
- }
- if (bDetaQ2Q0) {
- const int32_t q3 = pPix[3 * iStrideX];
- pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3; //q0
- pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2; //q1
- pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
- } else {
- pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; //q0
- }
- } else {
- pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; //p0
- pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2; //q0
- }
- }
- pPix += iStrideY;
- }
-}
-void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
- DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
-}
-void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
- DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
-}
-void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
- DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
-}
-void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
- DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
-}
-void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
- int32_t iBeta, int8_t* pTc) {
- int32_t p0, p1, q0, q1, iDeta;
- bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
-
- for (int32_t i = 0; i < 8; i++) {
- int32_t iTc0 = pTc[i >> 1];
- if (iTc0 > 0) {
- p0 = pPixCb[-iStrideX];
- p1 = pPixCb[-2 * iStrideX];
- q0 = pPixCb[0];
- q1 = pPixCb[iStrideX];
-
- bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
- bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
- if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
- iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
- pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
- pPixCb[0] = WelsClip1 (q0 - iDeta); /* q0' */
- }
-
-
- p0 = pPixCr[-iStrideX];
- p1 = pPixCr[-2 * iStrideX];
- q0 = pPixCr[0];
- q1 = pPixCr[iStrideX];
-
- bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
- bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-
- if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
- iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
- pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
- pPixCr[0] = WelsClip1 (q0 - iDeta); /* q0' */
- }
- }
- pPixCb += iStrideY;
- pPixCr += iStrideY;
- }
-}
-void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
- int32_t iBeta) {
- int32_t p0, p1, q0, q1;
- bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
- for (int32_t i = 0; i < 8; i++) {
- //cb
- p0 = pPixCb[-iStrideX];
- p1 = pPixCb[-2 * iStrideX];
- q0 = pPixCb[0];
- q1 = pPixCb[iStrideX];
- bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
- bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
- if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
- pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; /* p0' */
- pPixCb[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; /* q0' */
- }
-
- //cr
- p0 = pPixCr[-iStrideX];
- p1 = pPixCr[-2 * iStrideX];
- q0 = pPixCr[0];
- q1 = pPixCr[iStrideX];
- bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
- bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
- bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
- if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
- pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; /* p0' */
- pPixCr[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; /* q0' */
- }
- pPixCr += iStrideY;
- pPixCb += iStrideY;
- }
-}
-void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* tc) {
- DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
-}
-void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* tc) {
- DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
-}
-void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
- DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
-}
-void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
- DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
-}
-
-#ifdef X86_ASM
-extern "C" {
- void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
- ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
-
- DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
- DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- }
-
- void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
- ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
-
- DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
- DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
- }
-
-}
-
-#endif
-
--- a/codec/common/deblocking_common.h
+++ /dev/null
@@ -1,55 +1,0 @@
-#ifndef WELS_DEBLOCKING_COMMON_H__
-#define WELS_DEBLOCKING_COMMON_H__
-#include "typedefs.h"
-void DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* pTc);
-void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* pTc);
-void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#ifdef X86_ASM
-void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
-void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
-void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* pTC);
-void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
- int8_t* pTC);
-#endif
-
-#if defined(HAVE_NEON)
-void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
-void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
-void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-#endif
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif //WELS_DEBLOCKING_COMMON_H__
--- a/codec/common/deblocking_neon.S
+++ /dev/null
@@ -1,1052 +1,0 @@
-/*!
-* \copy
-* Copyright (c) 2013, Cisco Systems
-* All rights reserved.
-
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions
-* are met:
-
-* * Redistributions of source code must retain the above copyright
-* notice, this list of conditions and the following disclaimer.
-
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in
-* the documentation and/or other materials provided with the
-* distribution.
-
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifdef HAVE_NEON
-.text
-
-#include "arm_arch_common_macro.S"
-
-#ifdef __APPLE__
-.macro JMP_IF_128BITS_IS_ZERO
- vorr.s16 $2, $0, $1
- vmov r3, r2, $2
- orr r3, r3, r2
- cmp r3, #0
-.endm
-
-.macro MASK_MATRIX
- vabd.u8 $6, $1, $2
- vcgt.u8 $6, $4, $6
-
- vabd.u8 $4, $0, $1
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
-
- vabd.u8 $4, $3, $2
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
-.endm
-
-
-.macro DIFF_LUMA_LT4_P1_Q1
- vabd.u8 $9, $0, $2
- vclt.u8 $9, $9, $4
- vrhadd.u8 $8, $2, $3
- vhadd.u8 $8, $0, $8
- vsub.s8 $8, $8, $1
- vmax.s8 $8, $8, $5
- vmin.s8 $8, $8, $6
- vand.s8 $8, $8, $9
- vand.s8 $8, $8, $7
- vadd.u8 $8, $1, $8
- vabs.s8 $9, $9
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0
- vsubl.u8 $5, $0, $3
- vsubl.u8 $6, $2, $1
- vshl.s16 $6, $6, #2
- vadd.s16 $5, $5, $6
- vrshrn.s16 $4, $5, #3
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0
- vaddl.u8 q4, $1, $2
- vaddl.u8 q5, $3, $4
- vadd.u16 q5, q4, q5
-
- vaddl.u8 q4, $0, $1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
-
- vrshrn.u16 $0, q5, #2
- vrshrn.u16 $7, q4, #3
-
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, $5, $1
- vadd.u16 q5, q4,q5
-
- vaddl.u8 q4, $2, $5
- vaddw.u8 q4, q4, $2
- vaddw.u8 q4, q4, $3
-
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 $6, d10, d8
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
- vmov $3, $2
- vbsl.u8 $3, $0, $1
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0
- vaddl.u8 $4, $0, $3
- vaddw.u8 $5, $4, $1
- vaddw.u8 $6, $4, $2
- vaddw.u8 $5, $5, $0
-
- vaddw.u8 $6, $6, $3
- vrshrn.u16 $7, $5, #2
- vrshrn.u16 $8, $6, #2
-.endm
-
-.macro LOAD_CHROMA_DATA_4
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro STORE_CHROMA_DATA_4
- vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro LOAD_LUMA_DATA_3
- vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
- vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_4
- vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
- vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
-.endm
-
-.macro LOAD_LUMA_DATA_4
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_3
- vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
- vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
- vcge.s8 $1, $0, #0
- vand $1, $0, $1
- vsub.s8 $0, $1, $0
-.endm
-#else
-.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
- vorr.s16 \arg2, \arg0, \arg1
- vmov r3, r2, \arg2
- orr r3, r3, r2
- cmp r3, #0
-.endm
-
-.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vabd.u8 \arg6, \arg1, \arg2
- vcgt.u8 \arg6, \arg4, \arg6
-
- vabd.u8 \arg4, \arg0, \arg1
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
-
- vabd.u8 \arg4, \arg3, \arg2
- vclt.u8 \arg4, \arg4, \arg5
- vand.u8 \arg6, \arg6, \arg4
-.endm
-
-.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- vabd.u8 \arg9, \arg0, \arg2
- vclt.u8 \arg9, \arg9, \arg4
- vrhadd.u8 \arg8, \arg2, \arg3
- vhadd.u8 \arg8, \arg0, \arg8
- vsub.s8 \arg8, \arg8, \arg1
- vmax.s8 \arg8, \arg8, \arg5
- vmin.s8 \arg8, \arg8, \arg6
- vand.s8 \arg8, \arg8, \arg9
- vand.s8 \arg8, \arg8, \arg7
- vadd.u8 \arg8, \arg1, \arg8
- vabs.s8 \arg9, \arg9
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vsubl.u8 \arg5, \arg0, \arg3
- vsubl.u8 \arg6, \arg2, \arg1
- vshl.s16 \arg6, \arg6, #2
- vadd.s16 \arg5, \arg5, \arg6
- vrshrn.s16 \arg4, \arg5, #3
-.endm
-
-
-.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- vaddl.u8 q4, \arg1, \arg2
- vaddl.u8 q5, \arg3, \arg4
- vadd.u16 q5, q4, q5
-
- vaddl.u8 q4, \arg0, \arg1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
-
- vrshrn.u16 \arg0, q5, #2
- vrshrn.u16 \arg7, q4, #3
-
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, \arg5, \arg1
- vadd.u16 q5, q4,q5
-
- vaddl.u8 q4, \arg2, \arg5
- vaddw.u8 q4, q4, \arg2
- vaddw.u8 q4, q4, \arg3
-
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 \arg6, d10, d8
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
- vmov \arg3, \arg2
- vbsl.u8 \arg3, \arg0, \arg1
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vaddl.u8 \arg4, \arg0, \arg3
- vaddw.u8 \arg5, \arg4, \arg1
- vaddw.u8 \arg6, \arg4, \arg2
- vaddw.u8 \arg5, \arg5, \arg0
- vaddw.u8 \arg6, \arg6, \arg3
- vrshrn.u16 \arg7, \arg5, #2
- vrshrn.u16 \arg8, \arg6, #2
-.endm
-
-.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
-.endm
-
-.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
- vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
-.endm
-
-.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
- vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
- vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
-.endm
-
-.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1
- vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
- vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
- vcge.s8 \arg1, \arg0, #0
- vand \arg1, \arg0, \arg1
- vsub.s8 \arg0, \arg1, \arg0
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
- vpush {q4-q7}
- vdup.u8 q11, r2
- vdup.u8 q9, r3
-
- add r2, r1, r1, lsl #1
- sub r2, r0, r2
- vld1.u8 {q0}, [r2], r1
- vld1.u8 {q3}, [r0], r1
- vld1.u8 {q1}, [r2], r1
- vld1.u8 {q4}, [r0], r1
- vld1.u8 {q2}, [r2]
- vld1.u8 {q5}, [r0]
- sub r2, r2, r1
-
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
-
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
-
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- vst1.u8 {q6}, [r2], r1
-
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
-
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
- vst1.u8 {q2}, [r2], r1
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
- vst1.u8 {q3}, [r2] , r1
- vst1.u8 {q7}, [r2]
-
- vpop {q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
- vpush {q4-q7}
-
- vdup.u8 q5, r2
- vdup.u8 q4, r3
-
- sub r3, r0, r1, lsl #2
- vld1.u8 {q8}, [r3], r1
- vld1.u8 {q12}, [r0], r1
- vld1.u8 {q9}, [r3], r1
- vld1.u8 {q13}, [r0], r1
- vld1.u8 {q10}, [r3], r1
- vld1.u8 {q14}, [r0], r1
- vld1.u8 {q11}, [r3]
- vld1.u8 {q15}, [r0]
- sub r3, r3, r1 , lsl #1
-
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
-
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
-
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
-
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
-
- vmov q3, q1
-
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
-
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
- vst1.u8 {q4}, [r3], r1
-
- vmov q0, q2
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
-
- vand.u8 q0, q7, q0
- DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
- vst1.u8 {q4}, [r3], r1
- DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
- vst1.u8 {q4}, [r3], r1
-
- vpop {q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
- vpush {q4-q7}
-
- vdup.u8 q11, r2
- vdup.u8 q9, r3
-
- sub r2, r0, #3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
- LOAD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
-
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
- LOAD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
-
- vswp d1, d2
- vswp d3, d4
- vswp d1, d4
- vswp d7, d8
- vswp d9, d10
- vswp d7, d10
-
- sub r0, r0, r1, lsl #4
-
- ldr r3, [sp, #64]
- vld1.s8 {d31}, [r3]
- vdup.s8 d28, d31[0]
- vdup.s8 d30, d31[1]
- vdup.s8 d29, d31[2]
- vdup.s8 d31, d31[3]
- vtrn.32 d28, d30
- vtrn.32 d29, d31
- vcge.s8 q10, q14, #0
-
- MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
- vand.u8 q10, q10, q15
-
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
- DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
-
- vabs.s8 q12, q12
- vabs.s8 q13, q13
- vadd.u8 q14,q14,q12
- vadd.u8 q14,q14,q13
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
- vand.s8 q8, q8, q10
- EXTRACT_DELTA_INTO_TWO_PART q8, q9
- vqadd.u8 q2, q2, q9
- vqsub.u8 q2, q2, q8
-
- vqsub.u8 q3, q3, q9
- vqadd.u8 q3, q3, q8
-
- sub r0, #2
- add r2, r0, r1
- lsl r1, #1
-
- vmov q1, q6
- vmov q4, q7
-
- vswp q2, q3
- vswp d3, d6
- vswp d5, d8
-
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
- STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
-
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
- STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
-
- vpop {q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
- vpush {q4-q7}
- vdup.u8 q5, r2
- vdup.u8 q4, r3
-
- sub r3, r0, #4 // pix -= 4
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,0
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,1
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,2
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,3
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,4
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,5
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,6
- LOAD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,7
-
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,0
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,1
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,2
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,3
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,4
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,5
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,6
- LOAD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,7
-
- vswp q9, q10
- vswp d17,d18
- vswp d21,d22
- vswp q13,q14
- vswp d25,d26
- vswp d29,d30
- sub r0, r0, r1 , lsl #4
-
- MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
-
- mov r2, r2, lsr #2
- add r2, r2, #2
- vdup.u8 q5, r2
- vabd.u8 q0, q11, q12
- vclt.u8 q7, q0, q5
-
- vabd.u8 q1, q9, q11
- vclt.u8 q1, q1, q4
- vand.s8 q1, q1, q7
-
- vabd.u8 q2, q14,q12
- vclt.u8 q2, q2, q4
- vand.s8 q2, q2, q7
- vand.u8 q7, q7, q6
-
- vmov q3, q1
-
- DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
- DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
-
- vand.u8 q3, q7, q3
- DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
- vmov q9, q4
- vbsl.u8 q3, q8, q10
- DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
-
- vand.u8 q7, q7, q2
-
- DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
- DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
-
- vbsl.u8 q6, q2, q12
- DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
-
- vbsl.u8 q7, q0, q14
-
- vmov q5, q6
- vmov q2, q9
- vmov q6, q4
- vmov q4, q8
-
- vswp d8, d6
- vswp d5, d7
- vswp d5, d8
- vswp d14, d12
- vswp d11, d13
- vswp d11, d14
-
- sub r3, r0, #3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
- STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
-
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
- STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
-
- vpop {q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
-
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
-
- vld1.u8 {d0}, [r0], r2
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0]
- vld1.u8 {d7}, [r1]
-
- sub r0, r0, r2, lsl #1
- sub r1, r1, r2, lsl #1
-
- vld1.s8 {d31}, [r3]
- vmovl.u8 q14,d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
-
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
-
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vst1.u8 {d2}, [r0], r2
- vst1.u8 {d3}, [r1], r2
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
- vst1.u8 {d4}, [r0]
- vst1.u8 {d5}, [r1]
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
- vpush {q4-q5}
-
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
-
- sub r0, r0, r2 , lsl #1
- sub r1, r1, r2, lsl #1
- vdup.u8 q9, r3
- vld1.u8 {d0}, [r0], r2 // q0::p1
- vld1.u8 {d1}, [r1], r2
- vld1.u8 {d2}, [r0], r2 // q1::p0
- vld1.u8 {d3}, [r1], r2
- vld1.u8 {d4}, [r0], r2 // q2::q0
- vld1.u8 {d5}, [r1], r2
- vld1.u8 {d6}, [r0] // q3::q1
- vld1.u8 {d7}, [r1]
-
- sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
- sub r1, r1, r2, lsl #1
-
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
-
- vmov q11, q10
-
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q8, d30, d0 // Cb::p0' q0'
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d31, d1 // Cr::p0' q0'
-
- vbsl.u8 q10, q15, q1
- vst1.u8 {d20}, [r0], r2
- vst1.u8 {d21}, [r1], r2
-
- vbsl.u8 q11, q0, q2
- vst1.u8 {d22}, [r0]
- vst1.u8 {d23}, [r1]
-
- vpop {q4-q5}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
-
- vdup.u8 q11, r3
- ldr r3, [sp, #0]
-
- sub r0, r0, #2
- vdup.u8 q9, r3
- ldr r3, [sp, #4]
- sub r1, r1, #2
- vld1.s8 {d31}, [r3]
-
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
-
- vmovl.u8 q14, d31
- vshl.u64 d29,d28,#8
- vorr d28,d29
- vmov d29, d28
- veor q15, q15
- vsub.i8 q15,q15,q14
-
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
-
- DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
- DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
- vmax.s8 q8, q8, q15
- vmin.s8 q8, q8, q14
-
- vand.s8 q8, q8, q10
- vcge.s8 q14, q14, #0
- vand.s8 q8, q8, q14
- EXTRACT_DELTA_INTO_TWO_PART q8, q10
- vqadd.u8 q1, q1, q10
- vqsub.u8 q1, q1, q8
- vqsub.u8 q2, q2, q10
- vqadd.u8 q2, q2, q8
-
- sub r0, r0, r2, lsl #3
- sub r1, r1, r2, lsl #3
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
-
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
- vpush {q4-q5}
- vdup.u8 q11, r3
- ldr r3, [sp, #32]
-
- sub r0, r0, #2
- sub r1, r1, #2
-
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- LOAD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
- vswp q1, q2
- vswp d1, d2
- vswp d6, d5
-
- vdup.u8 q9, r3
- MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
- vmov q11, q10
-
- DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
- DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
-
- vbsl.u8 q10, q4, q1
- vbsl.u8 q11, q5, q2
- sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
- sub r1, r1, r2, lsl #3
-
- vmov q1, q10
- vmov q2, q11
- vswp d1, d2
- vswp d6, d5
- vswp q1, q2
- // Cb:d0d1d2d3, Cr:d4d5d6d7
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
- STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
-
- vpop {q4-q5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
-
- vld1.64 {d0-d2}, [r0]
-
- vceq.s8 q0, q0, #0
- vceq.s8 d2, d2, #0
- vmvn q0, q0
- vmvn d2, d2
- vabs.s8 q0, q0
- vabs.s8 d2, d2
-
- vst1.64 {d0-d2}, [r0]
-WELS_ASM_FUNC_END
-
-#ifdef __APPLE__
-.macro BS_NZC_CHECK
- vld1.8 {d0,d1}, [$0]
- /* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_nzc_check_jump0
-
- sub r6, $0, $2, lsl #4
- sub r6, $2, lsl #3
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
- vext.8 q1, q1, q0, #12
- vadd.u8 $3, q0, q1
-
-
- /* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_nzc_check_jump1
-
- sub r6, $0, #21
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vadd.u8 $4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
- mov r6, #4
- vabd.s16 q8, $0, $1
- vabd.s16 q9, $1, $2
- vdup.s16 $0, r6
- vabd.s16 q10, $2, $3
- vabd.s16 q11, $3, $4
-
- vcge.s16 q8, $0
- vcge.s16 q9, $0
- vcge.s16 q10, $0
- vcge.s16 q11, $0
-
- vpadd.i16 d16, d16, d17
- vpadd.i16 d17, d18, d19
- vpadd.i16 d18, d20, d21
- vpadd.i16 d19, d22, d23
-
- vaddhn.i16 $5, q8, q8
- vaddhn.i16 $6, q9, q9
-.endm
-
-.macro BS_MV_CHECK
- vldm $0, {q0,q1,q2,q3}
-
- /* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_mv_check_jump0
-
- sub r6, $0, $2, lsl #6
- add r6, #48
- vld1.8 {d8, d9}, [r6]
-
-bs_mv_check_jump0:
- BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
-
- /* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_mv_check_jump1
-
- sub r6, $0, #52
- add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
- vld1.32 d8[1], [r7]
- add r7, r6, #16
- vld1.32 d9[0], [r6]
- vld1.32 d9[1], [r7]
-
-bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
- vzip.32 q2, q3
- BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
-.endm
-#else
-.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
- vld1.8 {d0,d1}, [\arg0]
- /* Arrenge the input data --- TOP */
- ands r6, \arg1, #2
- beq bs_nzc_check_jump0
-
- sub r6, \arg0, \arg2, lsl #4
- sub r6, r6, \arg2, lsl #3
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
- vext.8 q1, q1, q0, #12
- vadd.u8 \arg3, q0, q1
-
-
- /* Arrenge the input data --- LEFT */
- ands r6, \arg1, #1
- beq bs_nzc_check_jump1
-
- sub r6, \arg0, #21
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vadd.u8 \arg4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
- mov r6, #4
- vabd.s16 q8, \arg0, \arg1
- vabd.s16 q9, \arg1, \arg2
- vdup.s16 \arg0, r6
- vabd.s16 q10, \arg2, \arg3
- vabd.s16 q11, \arg3, \arg4
-
- vcge.s16 q8, \arg0
- vcge.s16 q9, \arg0
- vcge.s16 q10, \arg0
- vcge.s16 q11, \arg0
-
- vpadd.i16 d16, d16, d17
- vpadd.i16 d17, d18, d19
- vpadd.i16 d18, d20, d21
- vpadd.i16 d19, d22, d23
-
- vaddhn.i16 \arg5, q8, q8
- vaddhn.i16 \arg6, q9, q9
-.endm
-
-.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
- vldm \arg0, {q0,q1,q2,q3}
-
- /* Arrenge the input data --- TOP */
- ands r6, \arg1, #2
- beq bs_mv_check_jump0
-
- sub r6, \arg0, \arg2, lsl #6
- add r6, #48
- vld1.8 {d8, d9}, [r6]
-
-bs_mv_check_jump0:
- BS_COMPARE_MV q4, q0, q1, q2, q3, \arg3, \arg4
-
- /* Arrenge the input data --- LEFT */
- ands r6, \arg1, #1
- beq bs_mv_check_jump1
-
- sub r6, \arg0, #52
- add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
- vld1.32 d8[1], [r7]
- add r7, r6, #16
- vld1.32 d9[0], [r6]
- vld1.32 d9[1], [r7]
-
-bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
- vzip.32 q2, q3
- BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
-.endm
-#endif
-
-
-WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
-
- stmdb sp!, {r5-r7}
- vpush {q4}
-
- ldr r5, [sp, #28] //Save BS to r5
-
- /* Checking the nzc status */
- BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
-
- /* For checking bS[I] = 2 */
- mov r6, #2
- vcgt.s8 q14, q14, #0
- vdup.u8 q0, r6
- vcgt.s8 q15, q15, #0
-
- vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
- vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-
- /* Checking the mv status*/
- BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
-
- /* For checking bS[I] = 1 */
- mov r6, #1
- vdup.u8 q0, r6
-
- vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
- vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-
-
- /* Check bS[I] is '1' or '2' */
- vmax.u8 q1, q12, q14
- vmax.u8 q0, q13, q15
-
- //vstm r5, {q0, q1}
- vst1.32 {q0, q1}, [r5]
- vpop {q4}
- ldmia sp!, {r5-r7}
-WELS_ASM_FUNC_END
-#endif
--- a/codec/common/expand_picture.asm
+++ /dev/null
@@ -1,728 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* expand_picture.asm
-;*
-;* Abstract
-;* mmxext/sse for expand_frame
-;*
-;* History
-;* 09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+%2]
-%endmacro
-
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
- ;r2 [width/16(8)]
- ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
- ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
-
-%if %1 == 32 ; for luma
- sar r2, 04h ; width / 16(8) pixels
-.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
-
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
-
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
-
- dec r2
- jnz near .top_bottom_loops
-%elif %1 == 16 ; for chroma ??
- mov r6, r2
- sar r2, 04h ; (width / 16) pixels
-.top_bottom_loops:
- ; top
- movdqa xmm0, [r0] ; first line of picture pData
- mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_16x4_sse2 r5, r1, xmm0, a
- mov_line_end16x4_sse2 r5, r1, xmm0, a
-
- ; bottom
- movdqa xmm1, [r3] ; last line of picture pData
- mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_16x4_sse2 r4, r1, xmm1, a
- mov_line_end16x4_sse2 r4, r1, xmm1, a
-
- lea r0, [r0+16] ; top pSrc
- lea r5, [r5+16] ; top dst
- lea r3, [r3+16] ; bottom pSrc
- lea r4, [r4+16] ; bottom dst
- neg r1 ; positive/negative stride need for next loop?
-
- dec r2
- jnz near .top_bottom_loops
-
- ; for remaining 8 bytes
- and r6, 0fh ; any 8 bytes left?
- test r6, r6
- jz near .to_be_continued ; no left to exit here
-
- ; top
- movq mm0, [r0] ; remained 8 byte
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
- ; bottom
- movq mm1, [r3]
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
- WELSEMMS
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ;r6 [height]
- ;r0 [pSrc+0] r5[pSrc-32] r1[stride]
- ;r3 [pSrc+(w-1)] r4[pSrc+w]
-
-%if %1 == 32 ; for luma
-.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
- movdqa [r5+16], xmm0
-
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r4], xmm1
- movdqa [r4+16], xmm1
-
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
-
- dec r6
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
-.left_right_loops:
- ; left
- movzx r2d, byte [r0] ; pixel pData for left border
- SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [r5], xmm0
-
- ; right
- movzx r2d, byte [r3]
- SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
- lea r0, [r0+r1] ; left pSrc
- lea r5, [r5+r1] ; left dst
- lea r3, [r3+r1] ; right pSrc
- lea r4, [r4+r1] ; right dst
-
- dec r6
- jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
- ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
- ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32 ; luma
- ; TL
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
-
- ; TR
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
-
- ; BL
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
-
- ; BR
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
- mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
-%elif %1 == 16 ; chroma
- ; TL
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
-
- ; TR
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
-
- ; BL
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
-
- ; BR
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
-%endif
-%endmacro
-
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureLuma_sse2
-
- push r4
- push r5
- push r6
-
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 7
-
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r3, r3d
-
- ;also prepare for cross border pData top-left:xmm3
-
- movzx r6d,byte[r0]
- SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
-
- neg r1
- lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
- neg r1
-
- push r3
-
-
- dec r3 ;h-1
- imul r3,r1 ;(h-1)*stride
- lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
- mov r6,r1 ;r6 = stride
- sal r6,05h ;r6 = 32*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
- ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
- movzx r6d,byte [r3] ;bottom-left
- SSE2_Copy16Times xmm5,r6d
-
- lea r6,[r3+r2-1]
- movzx r6d,byte [r6]
- SSE2_Copy16Times xmm6,r6d ;bottom-right
-
- neg r1 ;r1 = -stride
-
- push r0
- push r1
- push r2
-
- exp_top_bottom_sse2 32
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- pop r2
- pop r1
- pop r0
-
- lea r5,[r0-32] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
- lea r4,[r3+1] ;right border dst
-
- ;prepare for cross border data: top-rigth with xmm4
- movzx r6d,byte [r3] ;top -rigth
- SSE2_Copy16Times xmm4,r6d
-
- neg r1 ;r1 = stride
-
-
- pop r6 ; r6 = height
-
-
-
- push r0
- push r1
- push r2
- push r6
-
- exp_left_right_sse2 32,a
-
- pop r6
- pop r2
- pop r1
- pop r0
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
- neg r1 ;r1 = -stride
- lea r3,[r0-32]
- lea r3,[r3+r1] ;last line of top-left border
-
- lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
- neg r1 ;r1 = stride
- add r6,32 ;height +32(16) ,luma = 32, chroma = 16
- imul r6,r1
-
- lea r5,[r3+r6] ;last line of bottom-left border
- lea r6,[r4+r6] ;last line of botoom-right border
-
- neg r1 ; r1 = -stride
-
- ; for left & right border expanding
- exp_cross_sse2 32,a
-
- POP_XMM
- LOAD_4_PARA_POP
-
- pop r6
- pop r5
- pop r4
-
- %assign push_num 0
-
-
- ret
-
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureChromaAlign_sse2
-
- push r4
- push r5
- push r6
-
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 7
-
- SIGN_EXTENSION r1,r1d
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
-
- ;also prepare for cross border pData top-left:xmm3
-
- movzx r6d,byte [r0]
- SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
-
- neg r1
- lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
- neg r1
-
- push r3
-
-
- dec r3 ;h-1
- imul r3,r1 ;(h-1)*stride
- lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
- mov r6,r1 ;r6 = stride
- sal r6,04h ;r6 = 32*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
- ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
- movzx r6d,byte [r3] ;bottom-left
- SSE2_Copy16Times xmm5,r6d
-
- lea r6,[r3+r2-1]
- movzx r6d,byte [r6]
- SSE2_Copy16Times xmm6,r6d ;bottom-right
-
- neg r1 ;r1 = -stride
-
- push r0
- push r1
- push r2
-
- exp_top_bottom_sse2 16
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- pop r2
- pop r1
- pop r0
-
- lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
- lea r4,[r3+1] ;right border dst
-
- ;prepare for cross border data: top-rigth with xmm4
- movzx r6d,byte [r3] ;top -rigth
- SSE2_Copy16Times xmm4,r6d
-
- neg r1 ;r1 = stride
-
-
- pop r6 ; r6 = height
-
-
-
- push r0
- push r1
- push r2
- push r6
- exp_left_right_sse2 16,a
-
- pop r6
- pop r2
- pop r1
- pop r0
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
- neg r1 ;r1 = -stride
- lea r3,[r0-16]
- lea r3,[r3+r1] ;last line of top-left border
-
- lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
- neg r1 ;r1 = stride
- add r6,16 ;height +32(16) ,luma = 32, chroma = 16
- imul r6,r1
-
- lea r5,[r3+r6] ;last line of bottom-left border
- lea r6,[r4+r6] ;last line of botoom-right border
-
- neg r1 ; r1 = -stride
-
- ; for left & right border expanding
- exp_cross_sse2 16,a
-
- POP_XMM
- LOAD_4_PARA_POP
-
- pop r6
- pop r5
- pop r4
-
- %assign push_num 0
-
-
- ret
-
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureChromaUnalign_sse2
- push r4
- push r5
- push r6
-
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 7
-
- SIGN_EXTENSION r1,r1d
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
-
- ;also prepare for cross border pData top-left:xmm3
-
- movzx r6d,byte [r0]
- SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
-
- neg r1
- lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
- neg r1
-
- push r3
-
-
- dec r3 ;h-1
- imul r3,r1 ;(h-1)*stride
- lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
- mov r6,r1 ;r6 = stride
- sal r6,04h ;r6 = 32*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
- ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
- movzx r6d,byte [r3] ;bottom-left
- SSE2_Copy16Times xmm5,r6d
-
- lea r6,[r3+r2-1]
- movzx r6d,byte [r6]
- SSE2_Copy16Times xmm6,r6d ;bottom-right
-
- neg r1 ;r1 = -stride
-
- push r0
- push r1
- push r2
-
- exp_top_bottom_sse2 16
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- pop r2
- pop r1
- pop r0
-
- lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
- lea r4,[r3+1] ;right border dst
-
- ;prepare for cross border data: top-rigth with xmm4
- movzx r6d,byte [r3] ;top -rigth
- SSE2_Copy16Times xmm4,r6d
-
- neg r1 ;r1 = stride
-
-
- pop r6 ; r6 = height
-
-
-
- push r0
- push r1
- push r2
- push r6
- exp_left_right_sse2 16,u
-
- pop r6
- pop r2
- pop r1
- pop r0
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
- neg r1 ;r1 = -stride
- lea r3,[r0-16]
- lea r3,[r3+r1] ;last line of top-left border
-
- lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
- neg r1 ;r1 = stride
- add r6,16 ;height +32(16) ,luma = 32, chroma = 16
- imul r6,r1
-
- lea r5,[r3+r6] ;last line of bottom-left border
- lea r6,[r4+r6] ;last line of botoom-right border
-
- neg r1 ; r1 = -stride
-
- ; for left & right border expanding
- exp_cross_sse2 16,u
-
- POP_XMM
- LOAD_4_PARA_POP
-
- pop r6
- pop r5
- pop r4
-
- %assign push_num 0
-
-
- ret
--- a/codec/common/expand_picture_common.h
+++ /dev/null
@@ -1,72 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file expand_pic.h
- *
- * \brief Interface for expanding reconstructed picture to be used for reference
- *
- * \date 06/08/2009
- *************************************************************************************
- */
-
-#ifndef EXPAND_PICTURE_COMMON_H
-#define EXPAND_PICTURE_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(X86_ASM)
-void ExpandPictureLuma_sse2 (uint8_t* pDst,
- const int32_t kiStride,
- const int32_t kiPicW,
- const int32_t kiPicH);
-void ExpandPictureChromaAlign_sse2 (uint8_t* pDst,
- const int32_t kiStride,
- const int32_t kiPicW,
- const int32_t kiPicH);
-void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
- const int32_t kiStride,
- const int32_t kiPicW,
- const int32_t kiPicH);
-#endif//X86_ASM
-
-#if defined(HAVE_NEON)
-void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
-void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
-#endif
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif
--- a/codec/common/expand_picture_neon.S
+++ /dev/null
@@ -1,137 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
- stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
-
- add r4, r7, r2
- sub r4, #1
- //For the left and right expand
-_expand_picture_luma_loop2:
- sub r5, r7, #32
- add r6, r4, #1
-
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
-
- vst1.8 {q0}, [r5]!
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]!
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_luma_loop2
-
- //for the top and bottom expand
- add r2, #64
- sub r0, #32
- mla r4, r1, r3, r0
- sub r4, r1
-_expand_picture_luma_loop0:
- mov r5, #32
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
-
- mov r8, #32
-_expand_picture_luma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_luma_loop1
-
- subs r2, #16
- bne _expand_picture_luma_loop0
-
- //vldreq.32 d0, [r0]
-
- ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
- stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
-
- add r4, r7, r2
- sub r4, #1
- //For the left and right expand
-_expand_picture_chroma_loop2:
- sub r5, r7, #16
- add r6, r4, #1
-
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
-
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_chroma_loop2
-
- //for the top and bottom expand
- add r2, #32
- sub r0, #16
- mla r4, r1, r3, r0
- sub r4, r1
-_expand_picture_chroma_loop0:
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
-
- mov r8, #16
-_expand_picture_chroma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop1
-
- subs r2, #16
- bne _expand_picture_chroma_loop0
-
- //vldreq.32 d0, [r0]
-
- ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-#endif
--- /dev/null
+++ b/codec/common/inc/WelsThreadLib.h
@@ -1,0 +1,132 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file WelsThreadLib.h
+ *
+ * \brief Interfaces introduced in thread programming
+ *
+ * \date 11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELS_THREAD_API_H_
+#define _WELS_THREAD_API_H_
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+typedef HANDLE WELS_THREAD_HANDLE;
+typedef LPTHREAD_START_ROUTINE LPWELS_THREAD_ROUTINE;
+
+typedef CRITICAL_SECTION WELS_MUTEX;
+typedef HANDLE WELS_EVENT;
+
+#define WELS_THREAD_ROUTINE_TYPE DWORD WINAPI
+#define WELS_THREAD_ROUTINE_RETURN(rc) return (DWORD)rc;
+
+#else // NON-WINDOWS
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+typedef pthread_t WELS_THREAD_HANDLE;
+typedef void* (*LPWELS_THREAD_ROUTINE) (void*);
+
+typedef pthread_mutex_t WELS_MUTEX;
+typedef sem_t* WELS_EVENT;
+
+#define WELS_THREAD_ROUTINE_TYPE void *
+#define WELS_THREAD_ROUTINE_RETURN(rc) return (void*)(intptr_t)rc;
+
+#endif//_WIN32
+
+typedef int32_t WELS_THREAD_ERROR_CODE;
+typedef int32_t WELS_THREAD_ATTR;
+
+typedef struct _WelsLogicalProcessorInfo {
+ int32_t ProcessorCount;
+} WelsLogicalProcessInfo;
+
+#define WELS_THREAD_ERROR_OK 0
+#define WELS_THREAD_ERROR_GENERAL ((uint32_t)(-1))
+#define WELS_THREAD_ERROR_WAIT_OBJECT_0 0
+#define WELS_THREAD_ERROR_WAIT_TIMEOUT ((uint32_t)0x00000102L)
+#define WELS_THREAD_ERROR_WAIT_FAILED WELS_THREAD_ERROR_GENERAL
+
+void WelsSleep (uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex);
+
+WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* p_event, const char* event_name);
+WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name);
+WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT* event_list,
+ WELS_EVENT* master_event = NULL);
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list,
+ WELS_EVENT* master_event = NULL);
+
+WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
+ void* arg, WELS_THREAD_ATTR attr);
+
+WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread);
+
+WELS_THREAD_HANDLE WelsThreadSelf();
+
+WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+++ b/codec/common/inc/cpu.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file cpu.h
+ *
+ * \brief CPU feature compatibility detection
+ *
+ * \date 04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_DETECTION_H__)
+#define WELS_CPU_DETECTION_H__
+
+#include "typedefs.h"
+#include "cpu_core.h"
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+/*
+ * cpuid support verify routine
+ * return 0 if cpuid is not supported by cpu
+ */
+int32_t WelsCPUIdVerify();
+
+void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
+
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
+
+void WelsEmms();
+
+/*
+ * clear FPU registers states for potential float based calculation if support
+ */
+void WelsCPURestore (const uint32_t kuiCPU);
+
+#else
+#define WelsEmms()
+#endif
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//WELS_CPU_DETECTION_H__
--- /dev/null
+++ b/codec/common/inc/cpu_core.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file cpu_core.h
+ *
+ * \brief cpu core feature detection
+ *
+ * \date 4/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
+#define WELS_CPU_CORE_FEATURE_DETECTION_H__
+
+/*
+ * WELS CPU feature flags
+ */
+#define WELS_CPU_MMX 0x00000001 /* mmx */
+#define WELS_CPU_MMXEXT 0x00000002 /* mmx-ext*/
+#define WELS_CPU_SSE 0x00000004 /* sse */
+#define WELS_CPU_SSE2 0x00000008 /* sse 2 */
+#define WELS_CPU_SSE3 0x00000010 /* sse 3 */
+#define WELS_CPU_SSE41 0x00000020 /* sse 4.1 */
+#define WELS_CPU_3DNOW 0x00000040 /* 3dnow! */
+#define WELS_CPU_3DNOWEXT 0x00000080 /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC 0x00000100 /* altivec */
+#define WELS_CPU_SSSE3 0x00000200 /* ssse3 */
+#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
+
+/* CPU features application extensive */
+#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */
+#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
+#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
+ physical processor package is capable of supporting more than one logic processor
+ */
+#define WELS_CPU_CMOV 0x00004000 /* Conditional Move Instructions,
+ also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+ */
+#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
+#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
+#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
+
+/* For the android OS */
+#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */
+#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
+#define WELS_CPU_NEON 0x000004 /* NEON */
+
+/*
+ * Interfaces for CPU core feature detection as below
+ */
+
+#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- /dev/null
+++ b/codec/common/inc/crt_util_safe_x.h
@@ -1,0 +1,99 @@
+/*!
+ * \copy
+ * Copyright (c) 2010-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file crt_util_safe_x.h
+ *
+ * \brief Safe CRT like util for cross platfroms support
+ *
+ * \date 06/04/2010 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/timeb.h>
+#include <sys/time.h>
+#include "typedefs.h"
+#endif//_WIN32
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WELS_FILE_SEEK_SET SEEK_SET
+#define WELS_FILE_SEEK_CUR SEEK_CUR
+#define WESL_FILE_SEEK_END SEEK_END
+
+typedef FILE WelsFileHandle;
+
+#ifdef _WIN32
+typedef struct _timeb SWelsTime;
+#else
+typedef struct timeb SWelsTime;
+#endif
+
+int32_t WelsSnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, ...);
+char* WelsStrncpy (char* dest, int32_t sizeInBytes, const char* src);
+char* WelsStrcat (char* dest, int32_t sizeInBytes, const char* src);
+int32_t WelsVsnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, va_list argptr);
+
+WelsFileHandle* WelsFopen (const char* filename, const char* mode);
+int32_t WelsFclose (WelsFileHandle* fp);
+int32_t WelsFread (void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t WelsFwrite (const void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin);
+int32_t WelsFflush (WelsFileHandle* fp);
+
+int32_t WelsGetTimeOfDay (SWelsTime* tp);
+int32_t WelsStrftime (char* buffer, int32_t size, const char* format, const SWelsTime* tp);
+uint16_t WelsGetMillisecond (const SWelsTime* tp);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- /dev/null
+++ b/codec/common/inc/deblocking_common.h
@@ -1,0 +1,55 @@
+#ifndef WELS_DEBLOCKING_COMMON_H__
+#define WELS_DEBLOCKING_COMMON_H__
+#include "typedefs.h"
+void DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTc);
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTc);
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef X86_ASM
+void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+#endif
+
+#if defined(HAVE_NEON)
+void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
+void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
+void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+#endif
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif //WELS_DEBLOCKING_COMMON_H__
--- /dev/null
+++ b/codec/common/inc/expand_picture_common.h
@@ -1,0 +1,72 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file expand_pic.h
+ *
+ * \brief Interface for expanding reconstructed picture to be used for reference
+ *
+ * \date 06/08/2009
+ *************************************************************************************
+ */
+
+#ifndef EXPAND_PICTURE_COMMON_H
+#define EXPAND_PICTURE_COMMON_H
+
+#include "typedefs.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+void ExpandPictureLuma_sse2 (uint8_t* pDst,
+ const int32_t kiStride,
+ const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaAlign_sse2 (uint8_t* pDst,
+ const int32_t kiStride,
+ const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
+ const int32_t kiStride,
+ const int32_t kiPicW,
+ const int32_t kiPicH);
+#endif//X86_ASM
+
+#if defined(HAVE_NEON)
+void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
+void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
+#endif
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif
--- /dev/null
+++ b/codec/common/inc/logging.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * Copyright (c) 2013, Mozilla
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifndef WELS_LOGGING_H__
+#define WELS_LOGGING_H__
+
+// API surface.
+void WelsStderrSetTraceLevel (int32_t level);
+
+
+// Internal details.
+int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap);
+
+template<int level> int32_t welsStderrTrace (
+ const char* format, ...) {
+ va_list ap;
+ va_start (ap, format);
+ welsStderrLevelTrace (level, format, ap);
+ va_end (ap);
+ return 0;
+}
+
+#endif
--- /dev/null
+++ b/codec/common/inc/ls_defines.h
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ___LD_ST_MACROS___
+#define ___LD_ST_MACROS___
+
+#include <string.h>
+#include "typedefs.h"
+
+#ifdef __GNUC__
+
+struct tagUnaligned_64 {
+ uint64_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_32 {
+ uint32_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_16 {
+ uint16_t l;
+} __attribute__ ((packed));
+
+#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
+#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
+#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
+//#define _USE_STRUCT_INT_CVT
+// #ifdef _USE_STRUCT_INT_CVT
+#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
+#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
+#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
+// #else
+// inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
+// inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
+//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
+// #endif
+
+#else
+
+//#define INTD16(a) (*((int16_t*)(a)))
+//#define INTD32(a) (*((int32_t*)(a)))
+//#define INTD64(a) (*((int64_t*)(a)))
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+#define ST64(a, b) *((uint64_t*)(a)) = (b)
+
+#endif /* !__GNUC__ */
+
+#ifndef INTD16
+#define INTD16 LD16
+#endif//INTD16
+
+#ifndef INTD32
+#define INTD32 LD32
+#endif//INTD32
+
+#ifndef INTD64
+#define INTD64 LD64
+#endif//INTD64
+
+#endif//___LD_ST_MACROS___
--- /dev/null
+++ b/codec/common/inc/macros.h
@@ -1,0 +1,274 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file macros.h
+ *
+ * \brief MACRO based tool utilization
+ *
+ * \date 3/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_MACRO_UTILIZATIONS_H__
+#define WELS_MACRO_UTILIZATIONS_H__
+
+#include <math.h>
+#include <assert.h>
+#include "typedefs.h"
+
+/*
+* ENFORCE_STACK_ALIGN_1D: force 1 dimension local data aligned in stack
+* _tp: type
+* _nm: var name
+* _sz: size
+* _al: align bytes
+* auxiliary var: _nm ## _tEmP
+*/
+#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+ _tp _nm ## _tEmP[(_sz)+(_al)-1]; \
+ _tp *_nm = _nm ## _tEmP + ((_al)-1) - (((uintptr_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
+
+
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+ assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+ _tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
+ _tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+ _nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+ _tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+
+#if defined(_MSC_VER)
+
+#if(_MSC_VER < 1700)
+#define inline __inline
+#endif
+
+#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
+#define __align16(t,v) __declspec(align(16)) t v
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+ __declspec(align(alignment)) type name[(size)]
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+__declspec(align(alignment)) type name[(sizex)*(sizey)]
+
+#elif defined(__GNUC__)
+
+#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
+#define __align16(t,v) t v __attribute__ ((aligned (16)))
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+ type name[size] __attribute__((aligned(alignment)))
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+ type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
+#endif//_MSC_VER
+
+
+#ifndef WELS_ALIGN
+#define WELS_ALIGN(x, n) (((x)+(n)-1)&~((n)-1))
+#endif//WELS_ALIGN
+
+
+#if 1 // Alternative implementation of WELS_MAX and WELS_MIN
+#ifndef WELS_MAX
+#define WELS_MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif//WELS_MIN
+#else // Alternative implementation of WELS_MAX and WELS_MIN
+#ifndef WELS_MAX
+#define WELS_MAX(x, y) ((x) - (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y) ((y) + (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MIN
+#endif // Alternative implementation of WELS_MAX and WELS_MIN
+
+
+#ifndef WELS_CEIL
+#define WELS_CEIL(x) ceil(x) // FIXME: low complexity instead of math library used
+#endif//WELS_CEIL
+
+#ifndef WELS_FLOOR
+#define WELS_FLOOR(x) floor(x) // FIXME: low complexity instead of math library used
+#endif//WELS_FLOOR
+
+#ifndef WELS_ROUND
+#define WELS_ROUND(x) ((int32_t)(0.5f+(x)))
+#endif//WELS_ROUND
+
+#define WELS_NON_ZERO_COUNT_AVERAGE(nC,nA,nB) { \
+ nC = nA + nB + 1; \
+ nC >>= (uint8_t)( nA != -1 && nB != -1); \
+ nC += (uint8_t)(nA == -1 && nB == -1); \
+}
+
+static inline int32_t CeilLog2 (int32_t i) {
+int32_t s = 0;
+i--;
+while (i > 0) {
+ s++;
+ i >>= 1;
+}
+return s;
+}
+/*
+the second path will degrades the performance
+*/
+#if 1
+static inline int32_t WelsMedian (int32_t iX, int32_t iY, int32_t iZ) {
+int32_t iMin = iX, iMax = iX;
+
+if (iY < iMin)
+ iMin = iY;
+else
+ iMax = iY;
+
+if (iZ < iMin)
+ iMin = iZ;
+else if (iZ > iMax)
+ iMax = iZ;
+
+return (iX + iY + iZ) - (iMin + iMax);
+}
+#else
+static inline int32_t WelsMedian (int32_t iX, int32_t iY, int32_t iZ) {
+int32_t iTmp = (iX - iY) & ((iX - iY) >> 31);
+iX -= iTmp;
+iY += iTmp;
+iY -= (iY - iZ) & ((iY - iZ) >> 31);
+iY += (iX - iY) & ((iX - iY) >> 31);
+return iY;
+}
+
+#endif
+
+#ifndef NEG_NUM
+//#define NEG_NUM( num ) (-num)
+#define NEG_NUM(iX) (1+(~(iX)))
+#endif// NEG_NUM
+
+static inline uint8_t WelsClip1(int32_t iX) {
+ uint8_t uiTmp = (uint8_t)(((iX) & ~255) ? (-(iX) >> 31) : (iX));
+ return uiTmp;
+}
+
+#ifndef WELS_SIGN
+#define WELS_SIGN(iX) ((int32_t)(iX) >> 31)
+#endif //WELS_SIGN
+#ifndef WELS_ABS
+#define WELS_ABS(iX) ((WELS_SIGN(iX) ^ (int32_t)(iX)) - WELS_SIGN(iX))
+#endif //WELS_ABS
+
+// WELS_CLIP3
+#ifndef WELS_CLIP3
+#define WELS_CLIP3(iX, iY, iZ) ((iX) < (iY) ? (iY) : ((iX) > (iZ) ? (iZ) : (iX)))
+#endif //WELS_CLIP3
+
+/*
+ * Description: to check variable validation and return the specified result
+ * iResult: value to be checked
+ * iExpected: the expected value
+ */
+#ifndef WELS_VERIFY_RETURN_IFNEQ
+#define WELS_VERIFY_RETURN_IFNEQ(iResult, iExpected) \
+ if ( iResult != iExpected ){ \
+ return iResult; \
+ }
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ * Description: to check variable validation and return the specified result
+ * iResult: value to be return
+ * bCaseIf: negative condition to be verified
+ */
+#ifndef WELS_VERIFY_RETURN_IF
+#define WELS_VERIFY_RETURN_IF(iResult, bCaseIf) \
+ if ( bCaseIf ){ \
+ return iResult; \
+ }
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ * Description: to check variable validation and return the specified result
+ * with correspoinding process advance.
+ * result: value to be return
+ * case_if: negative condition to be verified
+ * proc: process need perform
+ */
+#ifndef WELS_VERIFY_RETURN_PROC_IF
+#define WELS_VERIFY_RETURN_PROC_IF(iResult, bCaseIf, fProc) \
+ if ( bCaseIf ){ \
+ fProc; \
+ return iResult; \
+ }
+#endif//#if WELS_VERIFY_RETURN_PROC_IF
+
+static inline int32_t WELS_LOG2 (uint32_t v) {
+int32_t r = 0;
+while (v >>= 1) {
+ ++r;
+}
+return r;
+
+}
+
+#define CLIP3_QP_0_51(q) WELS_CLIP3(q, 0, 51) // ((q) < (0) ? (0) : ((q) > (51) ? (51) : (q)))
+#define CALC_BI_STRIDE(width,bitcount) ((((width * bitcount) + 31) & ~31) >> 3)
+
+
+
+
+#ifndef BUTTERFLY1x2
+#define BUTTERFLY1x2(b) (((b)<<8) | (b))
+#endif//BUTTERFLY1x2
+
+#ifndef BUTTERFLY2x4
+#define BUTTERFLY2x4(wd) (((uint32_t)(wd)<<16) |(wd))
+#endif//BUTTERFLY2x4
+
+#ifndef BUTTERFLY4x8
+#define BUTTERFLY4x8(dw) (((uint64_t)(dw)<<32) | (dw))
+#endif//BUTTERFLY4x8
+
+static inline bool WELS_POWER2_IF (uint32_t v) {
+return (v && ! (v & (v - 1)));
+}
+
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
+#define WELS_GCC_UNUSED __attribute__((__unused__))
+#else
+#define WELS_GCC_UNUSED
+#endif
+
+
+
+#endif//WELS_MACRO_UTILIZATIONS_H__
--- /dev/null
+++ b/codec/common/inc/mc_common.h
@@ -1,0 +1,161 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef MC_COMMON_H
+#define MC_COMMON_H
+
+#include "typedefs.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(HAVE_NEON)
+void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+
+void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+ //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+ //vertical filter to gain half sample, that is (0, 2) location in quarter sample
+void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+ //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+
+void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+
+void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+
+void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+#endif
+
+#if defined(X86_ASM)
+//***************************************************************************//
+// MMXEXT definition //
+//***************************************************************************//
+void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+ const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+ const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+//***************************************************************************//
+// SSE2 definition //
+//***************************************************************************//
+void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iHeight);
+void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
+void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight);
+
+void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+ const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+ int32_t iHeight);
+
+void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+ int32_t iHeight);
+
+void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
+ int32_t iHeight);
+
+//***************************************************************************//
+// SSSE3 definition //
+//***************************************************************************//
+
+void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ const uint8_t* kpABCD, int32_t iHeight);
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//MC_COMMON_H
--- /dev/null
+++ b/codec/common/inc/measure_time.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file measure_time.h
+ *
+ * \brief time cost measure utilization
+ *
+ * \date 04/28/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
+#define WELS_TIME_COST_MEASURE_UTIL_H__
+
+#include <stdlib.h>
+
+#include "typedefs.h"
+#ifndef _WIN32
+#include <sys/time.h>
+#else
+#include <windows.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+/*!
+ * \brief time cost measure utilization
+ * \param void
+ * \return time elapsed since run (unit: microsecond)
+ */
+
+static inline int64_t WelsTime (void) {
+#ifndef _WIN32
+ struct timeval tv_date;
+
+ gettimeofday (&tv_date, NULL);
+ return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec);
+#else
+ static int64_t iMtimeFreq = 0;
+ int64_t iMtimeCur = 0;
+ int64_t iResult = 0;
+ if (!iMtimeFreq) {
+ QueryPerformanceFrequency ((LARGE_INTEGER*)&iMtimeFreq);
+ if (!iMtimeFreq)
+ iMtimeFreq = 1;
+ }
+ QueryPerformanceCounter ((LARGE_INTEGER*)&iMtimeCur);
+ iResult = (int64_t) ((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);
+ return iResult;
+#endif//WIN32
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- /dev/null
+++ b/codec/common/inc/typedefs.h
@@ -1,0 +1,74 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// typedef.h
+#ifndef WELS_TYPE_DEFINES_H__
+#define WELS_TYPE_DEFINES_H__
+
+#include <limits.h>
+#include <stddef.h>
+
+////////////////////////////////////////////////////////////////////////////
+// NOTICE : ALL internal implement MUST use the data type defined as below
+// ONLY except with the interface file !!!!!
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER
+
+#define __STDC_FORMAT_MACROS
+#include <stdint.h>
+#include <inttypes.h>
+
+#else
+
+// FIXME: all singed type should be declared explicit, for example, int8_t should be declared as signed char.
+typedef signed char int8_t ;
+typedef unsigned char uint8_t ;
+typedef short int16_t ;
+typedef unsigned short uint16_t;
+typedef int int32_t ;
+typedef unsigned int uint32_t;
+typedef __int64 int64_t ;
+typedef unsigned __int64 uint64_t;
+#define PRId64 "I64d"
+
+#endif // _MSC_VER defined
+
+// The 'float' type is portable and usable without any need for any extra typedefs.
+
+#ifdef EPSN
+#undef EPSN
+#endif//EPSN
+#define EPSN (0.000001f) // (1e-6) // desired float precision
+
+#endif //WELS_TYPE_DEFINES_H__
+
--- a/codec/common/logging.cpp
+++ /dev/null
@@ -1,49 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * Copyright (c) 2013, Mozilla
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <stdarg.h>
-#include <stdio.h>
-#include "typedefs.h"
-
-static int32_t g_TraceLevel = 0;
-
-void WelsStderrSetTraceLevel (int32_t level) {
- g_TraceLevel = level;
-}
-
-int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap) {
- if (level < g_TraceLevel) {
- vfprintf (stderr, format, ap);
- }
- return 0;
-}
--- a/codec/common/logging.h
+++ /dev/null
@@ -1,54 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * Copyright (c) 2013, Mozilla
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-
-#ifndef WELS_LOGGING_H__
-#define WELS_LOGGING_H__
-
-// API surface.
-void WelsStderrSetTraceLevel (int32_t level);
-
-
-// Internal details.
-int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap);
-
-template<int level> int32_t welsStderrTrace (
- const char* format, ...) {
- va_list ap;
- va_start (ap, format);
- welsStderrLevelTrace (level, format, ap);
- va_end (ap);
- return 0;
-}
-
-#endif
--- a/codec/common/ls_defines.h
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef ___LD_ST_MACROS___
-#define ___LD_ST_MACROS___
-
-#include <string.h>
-#include "typedefs.h"
-
-#ifdef __GNUC__
-
-struct tagUnaligned_64 {
- uint64_t l;
-} __attribute__ ((packed));
-struct tagUnaligned_32 {
- uint32_t l;
-} __attribute__ ((packed));
-struct tagUnaligned_16 {
- uint16_t l;
-} __attribute__ ((packed));
-
-#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
-#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
-#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
-//#define _USE_STRUCT_INT_CVT
-// #ifdef _USE_STRUCT_INT_CVT
-#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
-#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
-#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
-// #else
-// inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
-// inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
-//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
-// #endif
-
-#else
-
-//#define INTD16(a) (*((int16_t*)(a)))
-//#define INTD32(a) (*((int32_t*)(a)))
-//#define INTD64(a) (*((int64_t*)(a)))
-
-#define LD16(a) (*((uint16_t*)(a)))
-#define LD32(a) (*((uint32_t*)(a)))
-#define LD64(a) (*((uint64_t*)(a)))
-
-#define ST16(a, b) *((uint16_t*)(a)) = (b)
-#define ST32(a, b) *((uint32_t*)(a)) = (b)
-#define ST64(a, b) *((uint64_t*)(a)) = (b)
-
-#endif /* !__GNUC__ */
-
-#ifndef INTD16
-#define INTD16 LD16
-#endif//INTD16
-
-#ifndef INTD32
-#define INTD32 LD32
-#endif//INTD32
-
-#ifndef INTD64
-#define INTD64 LD64
-#endif//INTD64
-
-#endif//___LD_ST_MACROS___
--- a/codec/common/macros.h
+++ /dev/null
@@ -1,274 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file macros.h
- *
- * \brief MACRO based tool utilization
- *
- * \date 3/13/2009 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_MACRO_UTILIZATIONS_H__
-#define WELS_MACRO_UTILIZATIONS_H__
-
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-
-/*
-* ENFORCE_STACK_ALIGN_1D: force 1 dimension local data aligned in stack
-* _tp: type
-* _nm: var name
-* _sz: size
-* _al: align bytes
-* auxiliary var: _nm ## _tEmP
-*/
-#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
- _tp _nm ## _tEmP[(_sz)+(_al)-1]; \
- _tp *_nm = _nm ## _tEmP + ((_al)-1) - (((uintptr_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
-
-
-#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
- assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
- _tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
- _tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
- _nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
- _tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
-
-
-#if defined(_MSC_VER)
-
-#if(_MSC_VER < 1700)
-#define inline __inline
-#endif
-
-#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
-#define __align16(t,v) __declspec(align(16)) t v
-#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
- __declspec(align(alignment)) type name[(size)]
-#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
-__declspec(align(alignment)) type name[(sizex)*(sizey)]
-
-#elif defined(__GNUC__)
-
-#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
-#define __align16(t,v) t v __attribute__ ((aligned (16)))
-#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
- type name[size] __attribute__((aligned(alignment)))
-#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
- type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
-#endif//_MSC_VER
-
-
-#ifndef WELS_ALIGN
-#define WELS_ALIGN(x, n) (((x)+(n)-1)&~((n)-1))
-#endif//WELS_ALIGN
-
-
-#if 1 // Alternative implementation of WELS_MAX and WELS_MIN
-#ifndef WELS_MAX
-#define WELS_MAX(x, y) ((x) > (y) ? (x) : (y))
-#endif//WELS_MAX
-
-#ifndef WELS_MIN
-#define WELS_MIN(x, y) ((x) < (y) ? (x) : (y))
-#endif//WELS_MIN
-#else // Alternative implementation of WELS_MAX and WELS_MIN
-#ifndef WELS_MAX
-#define WELS_MAX(x, y) ((x) - (((x)-(y))&(((x)-(y))>>31)))
-#endif//WELS_MAX
-
-#ifndef WELS_MIN
-#define WELS_MIN(x, y) ((y) + (((x)-(y))&(((x)-(y))>>31)))
-#endif//WELS_MIN
-#endif // Alternative implementation of WELS_MAX and WELS_MIN
-
-
-#ifndef WELS_CEIL
-#define WELS_CEIL(x) ceil(x) // FIXME: low complexity instead of math library used
-#endif//WELS_CEIL
-
-#ifndef WELS_FLOOR
-#define WELS_FLOOR(x) floor(x) // FIXME: low complexity instead of math library used
-#endif//WELS_FLOOR
-
-#ifndef WELS_ROUND
-#define WELS_ROUND(x) ((int32_t)(0.5f+(x)))
-#endif//WELS_ROUND
-
-#define WELS_NON_ZERO_COUNT_AVERAGE(nC,nA,nB) { \
- nC = nA + nB + 1; \
- nC >>= (uint8_t)( nA != -1 && nB != -1); \
- nC += (uint8_t)(nA == -1 && nB == -1); \
-}
-
-static inline int32_t CeilLog2 (int32_t i) {
-int32_t s = 0;
-i--;
-while (i > 0) {
- s++;
- i >>= 1;
-}
-return s;
-}
-/*
-the second path will degrades the performance
-*/
-#if 1
-static inline int32_t WelsMedian (int32_t iX, int32_t iY, int32_t iZ) {
-int32_t iMin = iX, iMax = iX;
-
-if (iY < iMin)
- iMin = iY;
-else
- iMax = iY;
-
-if (iZ < iMin)
- iMin = iZ;
-else if (iZ > iMax)
- iMax = iZ;
-
-return (iX + iY + iZ) - (iMin + iMax);
-}
-#else
-static inline int32_t WelsMedian (int32_t iX, int32_t iY, int32_t iZ) {
-int32_t iTmp = (iX - iY) & ((iX - iY) >> 31);
-iX -= iTmp;
-iY += iTmp;
-iY -= (iY - iZ) & ((iY - iZ) >> 31);
-iY += (iX - iY) & ((iX - iY) >> 31);
-return iY;
-}
-
-#endif
-
-#ifndef NEG_NUM
-//#define NEG_NUM( num ) (-num)
-#define NEG_NUM(iX) (1+(~(iX)))
-#endif// NEG_NUM
-
-static inline uint8_t WelsClip1(int32_t iX) {
- uint8_t uiTmp = (uint8_t)(((iX) & ~255) ? (-(iX) >> 31) : (iX));
- return uiTmp;
-}
-
-#ifndef WELS_SIGN
-#define WELS_SIGN(iX) ((int32_t)(iX) >> 31)
-#endif //WELS_SIGN
-#ifndef WELS_ABS
-#define WELS_ABS(iX) ((WELS_SIGN(iX) ^ (int32_t)(iX)) - WELS_SIGN(iX))
-#endif //WELS_ABS
-
-// WELS_CLIP3
-#ifndef WELS_CLIP3
-#define WELS_CLIP3(iX, iY, iZ) ((iX) < (iY) ? (iY) : ((iX) > (iZ) ? (iZ) : (iX)))
-#endif //WELS_CLIP3
-
-/*
- * Description: to check variable validation and return the specified result
- * iResult: value to be checked
- * iExpected: the expected value
- */
-#ifndef WELS_VERIFY_RETURN_IFNEQ
-#define WELS_VERIFY_RETURN_IFNEQ(iResult, iExpected) \
- if ( iResult != iExpected ){ \
- return iResult; \
- }
-#endif//#if WELS_VERIFY_RETURN_IF
-
-/*
- * Description: to check variable validation and return the specified result
- * iResult: value to be return
- * bCaseIf: negative condition to be verified
- */
-#ifndef WELS_VERIFY_RETURN_IF
-#define WELS_VERIFY_RETURN_IF(iResult, bCaseIf) \
- if ( bCaseIf ){ \
- return iResult; \
- }
-#endif//#if WELS_VERIFY_RETURN_IF
-
-/*
- * Description: to check variable validation and return the specified result
- * with correspoinding process advance.
- * result: value to be return
- * case_if: negative condition to be verified
- * proc: process need perform
- */
-#ifndef WELS_VERIFY_RETURN_PROC_IF
-#define WELS_VERIFY_RETURN_PROC_IF(iResult, bCaseIf, fProc) \
- if ( bCaseIf ){ \
- fProc; \
- return iResult; \
- }
-#endif//#if WELS_VERIFY_RETURN_PROC_IF
-
-static inline int32_t WELS_LOG2 (uint32_t v) {
-int32_t r = 0;
-while (v >>= 1) {
- ++r;
-}
-return r;
-
-}
-
-#define CLIP3_QP_0_51(q) WELS_CLIP3(q, 0, 51) // ((q) < (0) ? (0) : ((q) > (51) ? (51) : (q)))
-#define CALC_BI_STRIDE(width,bitcount) ((((width * bitcount) + 31) & ~31) >> 3)
-
-
-
-
-#ifndef BUTTERFLY1x2
-#define BUTTERFLY1x2(b) (((b)<<8) | (b))
-#endif//BUTTERFLY1x2
-
-#ifndef BUTTERFLY2x4
-#define BUTTERFLY2x4(wd) (((uint32_t)(wd)<<16) |(wd))
-#endif//BUTTERFLY2x4
-
-#ifndef BUTTERFLY4x8
-#define BUTTERFLY4x8(dw) (((uint64_t)(dw)<<32) | (dw))
-#endif//BUTTERFLY4x8
-
-static inline bool WELS_POWER2_IF (uint32_t v) {
-return (v && ! (v & (v - 1)));
-}
-
-#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
-#define WELS_GCC_UNUSED __attribute__((__unused__))
-#else
-#define WELS_GCC_UNUSED
-#endif
-
-
-
-#endif//WELS_MACRO_UTILIZATIONS_H__
--- a/codec/common/mb_copy.asm
+++ /dev/null
@@ -1,581 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mb_copy.asm
-;*
-;* Abstract
-;* mb_copy and mb_copy1
-;*
-;* History
-;* 15/09/2009 Created
-;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-
-;***********************************************************************
-; void WelsCopy16x16_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy16x16_sse2
-
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
- lea r2, [r2+4*r3]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
-
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
- lea r2, [r2+4*r3]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy16x8NotAligned_sse2
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- POP_XMM
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy8x16_mmx
- %assign push_num 0
- LOAD_4_PARA
-
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
- lea r2, [r2+2*r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
- lea r0, [r0+2*r1]
-
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
-
- WELSEMMS
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy8x8_mmx
- push r4
- %assign push_num 1
- LOAD_4_PARA
- lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
-
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
-
- WELSEMMS
- LOAD_4_PARA_POP
- pop r4
- ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-WELS_EXTERN UpdateMbMv_sse2
-
- %assign push_num 0
- LOAD_2_PARA
-
- movd xmm0, r1d ; _mv
- pshufd xmm1, xmm0, $00
- movdqa [r0 ], xmm1
- movdqa [r0+0x10], xmm1
- movdqa [r0+0x20], xmm1
- movdqa [r0+0x30], xmm1
- ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq4_mmx
-
- %assign push_num 0
- LOAD_7_PARA
-
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
-
-ALIGN 4
-.height_loop:
- movd mm0, [r4]
- pavgb mm0, [r2]
- movd [r0], mm0
-
- dec r6
- lea r0, [r0+r1]
- lea r2, [r2+r3]
- lea r4, [r4+r5]
- jne .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq8_mmx
- %assign push_num 0
- LOAD_7_PARA
-
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
-
-ALIGN 4
-.height_loop:
- movq mm0, [r2]
- pavgb mm0, [r4]
- movq [r0], mm0
- movq mm0, [r2+r3]
- pavgb mm0, [r4+r5]
- movq [r0+r1], mm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- sub r6, 2
- jnz .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
- %assign push_num 0
- LOAD_7_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
- SIGN_EXTENSION r6, r6d
-ALIGN 4
-.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm1, [r4]
- pavgb xmm0, xmm1
- ;pavgb xmm0, [r4]
- movdqu [r0], xmm0
-
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
- movdqu [r0+r1], xmm0
-
- movdqu xmm0, [r2+2*r3]
- movdqu xmm1, [r4+2*r5]
- pavgb xmm0, xmm1
- movdqu [r0+2*r1], xmm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
- movdqu [r0+r1], xmm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- sub r6, 4
- jne .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-;*******************************************************************************
-; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-WELS_EXTERN McCopyWidthEq4_mmx
- push r5
- %assign push_num 1
- LOAD_5_PARA
-
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
-
-ALIGN 4
-.height_loop:
- mov r5d, [r0]
- mov [r2], r5d
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
- pop r5
- ret
-
-;*******************************************************************************
-; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_mmx
- %assign push_num 0
- LOAD_5_PARA
-
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
-
-ALIGN 4
-.height_loop:
- movq mm0, [r0]
- movq [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
-;*******************************************************************************
-; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
-%endmacro
-WELS_EXTERN McCopyWidthEq16_sse2
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
-ALIGN 4
-.height_loop:
- SSE_READ_UNA xmm0, r0
- SSE_READ_UNA xmm1, r0+r1
- SSE_WRITE_UNA r2, xmm0
- SSE_WRITE_UNA r2+r3, xmm1
-
- sub r4, 2
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- jnz .height_loop
-
- LOAD_5_PARA_POP
- ret
--- a/codec/common/mc_chroma.asm
+++ /dev/null
@@ -1,293 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( const uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
- %assign push_num 0
- LOAD_6_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
-
- movd mm3, [r4]; [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movd mm0, [r0]
- movd mm1, [r0+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [r4]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [r4+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [r2], mm0
-
- movq mm0, mm2
-
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
-
- dec r5
- jnz near .xloop
- WELSEMMS
- LOAD_6_PARA_POP
- ret
-
-
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
-
- movd xmm3, [r4]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movq xmm0, [r0]
- movq xmm1, [r0+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [r4]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [r4+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- movdqa xmm0, xmm2
-
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
-
- dec r5
- jnz near .xloop
-
- POP_XMM
- LOAD_6_PARA_POP
-
- ret
-
-
-
-
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; const uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r5, r5d
-
- pxor xmm7, xmm7
- movd xmm5, [r4]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- sub r2, r3 ;sub esi, edi
- sub r2, r3
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [r0]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea r2, [r2+2*r3]
-
- movdqu xmm2, [r0+r1]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [r2],xmm0
-
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [r2+r3],xmm4
-
- sub r5, 2
- jnz .hloop_chroma
-
- POP_XMM
- LOAD_6_PARA_POP
-
- ret
-
-
--- a/codec/common/mc_common.h
+++ /dev/null
@@ -1,161 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef MC_COMMON_H
-#define MC_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(HAVE_NEON)
-void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-
-void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
- //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
- //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
- //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-
-void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-
-void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-
-void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-#endif
-
-#if defined(X86_ASM)
-//***************************************************************************//
-// MMXEXT definition //
-//***************************************************************************//
-void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
- const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
- const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-//***************************************************************************//
-// SSE2 definition //
-//***************************************************************************//
-void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iHeight);
-void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iWidth, int32_t iHeight);
-void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
- int32_t iWidth, int32_t iHeight);
-
-void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
- const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
- int32_t iHeight);
-
-void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
- int32_t iHeight);
-
-void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
- int32_t iHeight);
-
-//***************************************************************************//
-// SSSE3 definition //
-//***************************************************************************//
-
-void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
- const uint8_t* kpABCD, int32_t iHeight);
-
-#endif //X86_ASM
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//MC_COMMON_H
--- a/codec/common/mc_luma.asm
+++ /dev/null
@@ -1,1164 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_luma.asm
-;*
-;* Abstract
-;* sse2 motion compensation
-;*
-;* History
-;* 17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
- dw 16, 16, 16, 16
-ALIGN 16
-h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
- dw 32, 32, 32, 32, 32, 32, 32, 32
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-
-;*******************************************************************************
-; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight)
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq4_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
-
- sub r0, 2
- WELS_Zero mm7
- movq mm6, [h264_w0x10]
-.height_loop:
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movd mm1, [r0+5]
- punpcklbw mm1, mm7
- movd mm2, [r0+1]
- punpcklbw mm2, mm7
- movd mm3, [r0+4]
- punpcklbw mm3, mm7
- movd mm4, [r0+2]
- punpcklbw mm4, mm7
- movd mm5, [r0+3]
- punpcklbw mm5, mm7
-
- paddw mm2, mm3
- paddw mm4, mm5
- psllw mm4, 2
- psubw mm4, mm2
- paddw mm0, mm1
- paddw mm0, mm4
- psllw mm4, 2
- paddw mm0, mm4
- paddw mm0, mm6
- psraw mm0, 5
- packuswb mm0, mm7
- movd [r2], mm0
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
-; int16_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride
-; int32_t iHeight
-; )
-;***********************************************************************
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- pxor xmm7, xmm7
-
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
-
-.yloop_width_8:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .yloop_width_8
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;*******************************************************************************
-; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
-; );
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
-
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;*******************************************************************************
-; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
-; );
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- lea r0, [r0-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2+8], xmm0
-
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-;*******************************************************************************
-; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight )
-;*******************************************************************************
-WELS_EXTERN McHorVer02WidthEq8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- sub r0, r1
- sub r0, r1
-
- WELS_Zero xmm7
-
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
-
-.xx_exit:
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-
-
-;***********************************************************************
-; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight )
-;***********************************************************************
-WELS_EXTERN McHorVer02Height9Or17_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
-
- shr r4, 3
- sub r0, r1
- sub r0, r1
-
-.xloop:
- WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- movdqa xmm0,xmm1
- movdqa xmm1,xmm2
- movdqa xmm2,xmm3
- movdqa xmm3,xmm4
- movdqa xmm4,xmm5
- movdqa xmm5,xmm6
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .xx_exit
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- sub r0, r1
- sub r0, r1
- add r0, 8
- add r2, 8
- jmp near .xloop
-
-.xx_exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-
-;***********************************************************************
-; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight
-; );
-;***********************************************************************
-WELS_EXTERN McHorVer20Width9Or17_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- sub r0, 2
- pxor xmm7, xmm7
-
- cmp r4, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+1], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movq [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2+8], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+9], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-
-
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-; (const uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t * pTap,
-; int32_t iTapStride,
-; int32_t iWidth,int32_t iHeight);
-;***********************************************************************
-WELS_EXTERN McHorVer22HorFirst_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
- pxor xmm7, xmm7
- sub r0, r1 ;;;;;;;;need more 5 lines.
- sub r0, r1
-
- cmp r4, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+2], xmm2
- movhps [r2+2+8], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2+16], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+18], xmm2
- movhps [r2+18+8], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-
-%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
-
-
- paddw %7, %5
- paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22Width8VerLastAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
-
- shr r4, 3
-
-.width_loop:
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- movdqa xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- movdqa xmm5, [r0+r1]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm5, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .exit
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
-
-.exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
-
-;***********************************************************************
-;void McHorVer22Width8VerLastUnAlign_sse2(
-; const uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
- %assign push_num 0
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
- shr r4, 3
-
-.width_loop:
- movdqu xmm0, [r0]
- movdqu xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqu xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- movdqu xmm5, [r0+r1]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm5, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .exit
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
-
-.exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- POP_XMM
- LOAD_6_PARA_POP
- ret
--- a/codec/common/mc_neon.S
+++ /dev/null
@@ -1,2210 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef __APPLE__
-.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
- vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
- vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 $0, $0, $0
- vpadd.s16 $0, $0, $0
- vqrshrun.s16 $0, $4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
-
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
-
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
- vrev64.16 $1, $1
- vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
- vshr.s64 $1, $2, #16
- vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
-
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
-.endm
-#else
-.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, \arg2, \arg1
- vrshrn.u16 \arg0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
- vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
- vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 \arg0, \arg0, \arg0
- vpadd.s16 \arg0, \arg0, \arg0
- vqrshrun.s16 \arg0, \arg4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg2, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg3, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 \arg4, \arg0, \arg1, #2 //src[0]
- vext.16 \arg3, \arg0, \arg1, #3 //src[1]
- vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
-
- vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
- vext.16 \arg2, \arg0, \arg1, #4 //src[2]
- vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
-
- vext.16 \arg2, \arg0, \arg1, #5 //src[3]
- vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
- vrev64.16 \arg1, \arg1
- vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
- vshr.s64 \arg1, \arg2, #16
- vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
-
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
-// }
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
-
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d2, q14, q15
-
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_h_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_h_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_h_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_h_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d16, d18, d20, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d17, d19, d21, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_10_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_10_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d16, d18, d20, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d17, d19, d21, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_30_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q8, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q8 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_30_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
-
-w16_xy_01_luma_loop:
-
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_01_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_01_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_01_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_01_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_01_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
-
-w16_xy_03_luma_loop:
-
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_03_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_03_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_03_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_03_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_03_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
-
-w16_v_mc_luma_loop:
-
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
-
- sub r4, #8
- cmp r4, #0
- bne w16_v_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_v_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_v_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_v_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vmov r4, r5, d16
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vmov r5, r6, d16
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_v_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
-
-w16_hv_mc_luma_loop:
-
- vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2], r3 //write 16Byte
-
-
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
-
- vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
-
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
-
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
-
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
-
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
-
- sub r4, #4
- cmp r4, #0
- bne w16_hv_mc_luma_loop
- vpop {q4-q7}
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
-
-w8_hv_mc_luma_loop:
-
- vld1.u8 {q8}, [r0], r1 //use 13(8+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
-
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
-
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
-
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2], r3 //write 8Byte
-
- //q4~q5, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
-
- sub r4, #4
- cmp r4, #0
- bne w8_hv_mc_luma_loop
- vpop {q4}
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
- push {r4 ,r5, r6}
- vpush {q4-q7}
- ldr r6, [sp, #76]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
-
-w4_hv_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
-
- //the 1st&2nd row
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
-
- vmov d23, d0
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //the 3rd&4th row
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
-
- vmov d23, d4
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //q4~q6, q0~q1, --> q0~q4
- vswp q4, q0
- vmov q3, q4
- vmov q4, q1
- vmov q1, q5
- vmov q2, q6
-
- sub r6, #4
- cmp r6, #0
- bne w4_hv_mc_luma_loop
-
- vpop {q4-q7}
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_copy_loop:
- vld1.u8 {q0}, [r0], r1
- sub r4, #2
- vld1.u8 {q1}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- cmp r4, #0
- vst1.u8 {q1}, [r2], r3
- bne w16_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-w8_copy_loop:
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vst1.u8 {d1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w8_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
-w4_copy_loop:
- ldr r5, [r0], r1
- ldr r6, [r0], r1
- str r5, [r2], r3
- str r6, [r2], r3
-
- sub r4, #2
- cmp r4, #0
- bne w4_copy_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_pix_avg_loop:
- vld1.u8 {q0}, [r2]!
- vld1.u8 {q1}, [r3]!
- vld1.u8 {q2}, [r2]!
- vld1.u8 {q3}, [r3]!
-
- vld1.u8 {q8}, [r2]!
- vld1.u8 {q9}, [r3]!
- vld1.u8 {q10}, [r2]!
- vld1.u8 {q11}, [r3]!
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
-
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
-
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
-
- sub r4, #4
- cmp r4, #0
- bne w16_pix_avg_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- mov r5, #16
-w8_pix_avg_loop:
-
- vld1.u8 {d0}, [r2], r5
- vld1.u8 {d2}, [r3], r5
- vld1.u8 {d1}, [r2], r5
- vld1.u8 {d3}, [r3], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
-
- vld1.u8 {d4}, [r2], r5
- vld1.u8 {d6}, [r3], r5
- vld1.u8 {d5}, [r2], r5
- vld1.u8 {d7}, [r3], r5
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
-
- sub r4, #4
- cmp r4, #0
- bne w8_pix_avg_loop
-
- pop {r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
- push {r4-r8}
- ldr r4, [sp, #20]
-w4_pix_avg_loop:
-
- ldr r5, [r2]
- ldr r6, [r2, #16]
- ldr r7, [r3]
- ldr r8, [r3, #16]
- add r2, #32
- add r3, #32
-
- vmov d0, r5, r6
- vmov d1, r7, r8
- AVERAGE_TWO_8BITS d0, d0, d1
- vmov r5, r6, d0
-
- str r5, [r0], r1
- str r6, [r0], r1
-
- sub r4, #2
- cmp r4, #0
- bne w4_pix_avg_loop
-
- pop {r4-r8}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- ldr r5, [sp, #12]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
- vld1.u8 {q0}, [r0], r1 //src[x]
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
- vext.u8 d1, d0, d1, #1 //src[x+1]
-
-w8_mc_chroma_loop: // each two pxl row
- vld1.u8 {q1}, [r0], r1 //src[x+stride]
- vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
- vext.u8 d3, d2, d3, #1 //src[x+stride+1]
- vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
-
- vmull.u8 q3, d0, d28 //(src[x] * A)
- vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmull.u8 q3, d2, d28 //(src[x] * A)
- vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmov q0, q2
- sub r5, #2
- cmp r5, #0
- bne w8_mc_chroma_loop
-
- pop {r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
-
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r6, [sp, #16]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
-w4_mc_chroma_loop: // each two pxl row
- vld1.u8 {d0}, [r0], r1 //a::src[x]
- vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
- vld1.u8 {d4}, [r0] //c::src[x+2*stride]
-
- vshr.u64 d1, d0, #8
- vshr.u64 d3, d2, #8
- vshr.u64 d5, d4, #8
-
- vmov q3, q1 //b::[0:7]+b::[1~8]
- vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
- vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-
- vmull.u8 q1, d0, d28 //(src[x] * A)
- vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
- vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
-
- vrshrn.u16 d2, q1, #6
- vmov r4, r5, d2
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_mc_chroma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d3, r5, r4 // 0x0014FFFB00010000
-
- sub r3, #16
- ldr r4, [sp, #8]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w17_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q8, q0, q1, #3 //q8=src[1]
- vext.8 q9, q0, q1, #4 //q9=src[2]
- vext.8 q10, q0, q1, #5 //q10=src[3]
-
- FILTER_6TAG_8BITS d0, d4, d6, d16, d18, d20, d22, q14, q15
-
- FILTER_6TAG_8BITS d1, d5, d7, d17, d19, d21, d23, q14, q15
-
- vst1.u8 {d22, d23}, [r2]! //write [0:15] Byte
-
- vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
-
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- sub r4, #1
- cmp r4, #0
- bne w17_h_mc_luma_loop
- pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d7, r5, r4 // 0x0014FFFB00010000
-
- sub r3, #8
- ldr r4, [sp, #8]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w9_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d16, q14, q15
-
- sub r4, #1
- vst1.u8 {d16}, [r2]! //write [0:7] Byte
-
- vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
-
- cmp r4, #0
- bne w9_h_mc_luma_loop
- pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q8}, [r0], r1 //q8=src[2]
-
-w17_v_mc_luma_loop:
-
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q10}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS d4, d6, d16, d18, d0, d2, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d17, d19, d1, d3, d21, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q10}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS d6, d16, d18, d0, d2, d4, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d17, d19, d1, d3, d5, d21, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q10}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS d16, d18, d0, d2, d4, d6, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d17, d19, d1, d3, d5, d7, d21, q14, q15
- vld1.u8 {q8}, [r0], r1 //read 6th row
- vst1.u8 {q10}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS d18, d0, d2, d4, d6, d16, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d19, d1, d3, d5, d7, d17, d21, q14, q15
- vld1.u8 {q9}, [r0], r1 //read 7th row
- vst1.u8 {q10}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q10}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d16, d18, d0, d20, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d17, d19, d1, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q8, q9, q0 --> q0~q8
- vswp q0, q8
- vswp q0, q2
- vmov q1, q3
- vmov q3, q9 //q0~q8
-
- sub r4, #8
- cmp r4, #1
- bne w17_v_mc_luma_loop
- // the last 16Bytes
- vld1.u8 {q9}, [r0], r1 //q9=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d16, d18, d20, q14, q15
- FILTER_6TAG_8BITS d1, d3, d5, d7, d17, d19, d21, q14, q15
- vst1.u8 {q10}, [r2], r3 //write 1st 16Byte
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w9_v_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d16}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d16, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d16}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d16, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d16}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d16, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d16}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #1
- bne w9_v_mc_luma_loop
-
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d16, q14, q15
- vst1.u8 {d16}, [r2], r3 //write last 8Byte
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
- push {r4}
- vpush {q4-q7}
- ldr r4, [sp, #68]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
- sub r3, #16
-
-w17_hv_mc_luma_loop:
-
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {d0, d1}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
- vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
- vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
- vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
-
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
-
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
-
- sub r4, #4
- cmp r4, #1
- bne w17_hv_mc_luma_loop
- //the last row
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- vpop {q4-q7}
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
- push {r4}
- vpush {q4}
- ldr r4, [sp, #20]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
- sub r3, #8
-
-w9_hv_mc_luma_loop:
-
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d16, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d17, d1, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d16, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d17, d1, d3, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d16, d0, d2, d4, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d17, d1, d3, d5, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
-
- //q4~q8, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q8
-
- sub r4, #4
- cmp r4, #1
- bne w9_hv_mc_luma_loop
- //the last row
- vld1.u8 {q8}, [r0], r1 //use 14(9+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d16, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d17, q10, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18 //output to q9[0]
- vst1.u8 d18, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d19, d20, d21, q10 //output to d19[0]
- vst1.u8 {d19[0]}, [r2], r3 //write 8th Byte
- vpop {q4}
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
-
-enc_w16_pix_avg_loop:
- vld1.u8 {q0}, [r2], r3
- vld1.u8 {q1}, [r4], r5
- vld1.u8 {q2}, [r2], r3
- vld1.u8 {q3}, [r4], r5
-
- vld1.u8 {q8}, [r2], r3
- vld1.u8 {q9}, [r4], r5
- vld1.u8 {q10}, [r2], r3
- vld1.u8 {q11}, [r4], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
-
- AVERAGE_TWO_8BITS d16, d16, d18
- AVERAGE_TWO_8BITS d17, d17, d19
- vst1.u8 {q8}, [r0], r1
-
- AVERAGE_TWO_8BITS d20, d20, d22
- AVERAGE_TWO_8BITS d21, d21, d23
- vst1.u8 {q10}, [r0], r1
-
- sub r6, #4
- cmp r6, #0
- bne enc_w16_pix_avg_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
-enc_w8_pix_avg_loop:
-
- vld1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r4], r5
- vld1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r4], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
-
- vld1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r4], r5
- vld1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r4], r5
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
-
- sub r6, #4
- cmp r6, #0
- bne enc_w8_pix_avg_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-#endif
--- a/codec/common/measure_time.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file measure_time.h
- *
- * \brief time cost measure utilization
- *
- * \date 04/28/2009 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
-#define WELS_TIME_COST_MEASURE_UTIL_H__
-
-#include <stdlib.h>
-
-#include "typedefs.h"
-#ifndef _WIN32
-#include <sys/time.h>
-#else
-#include <windows.h>
-#include <sys/timeb.h>
-#endif
-#include <time.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif//__cplusplus
-
-/*!
- * \brief time cost measure utilization
- * \param void
- * \return time elapsed since run (unit: microsecond)
- */
-
-static inline int64_t WelsTime (void) {
-#ifndef _WIN32
- struct timeval tv_date;
-
- gettimeofday (&tv_date, NULL);
- return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec);
-#else
- static int64_t iMtimeFreq = 0;
- int64_t iMtimeCur = 0;
- int64_t iResult = 0;
- if (!iMtimeFreq) {
- QueryPerformanceFrequency ((LARGE_INTEGER*)&iMtimeFreq);
- if (!iMtimeFreq)
- iMtimeFreq = 1;
- }
- QueryPerformanceCounter ((LARGE_INTEGER*)&iMtimeCur);
- iResult = (int64_t) ((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);
- return iResult;
-#endif//WIN32
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- a/codec/common/satd_sad.asm
+++ /dev/null
@@ -1,2184 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* satd_sad.asm
-;*
-;* Abstract
-;* WelsSampleSatd4x4_sse2
-;* WelsSampleSatd8x8_sse2
-;* WelsSampleSatd16x8_sse2
-;* WelsSampleSatd8x16_sse2
-;* WelsSampleSatd16x16_sse2
-;*
-;* WelsSampleSad16x8_sse2
-;* WelsSampleSad16x16_sse2
-;*
-;* History
-;* 8/5/2009 Created
-;* 24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1: dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2: dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0 , [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- movd xmm4, [r2]
- movd xmm5, [r2+r3]
- lea r2 , [r2+2*r3]
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm4, xmm6
- punpckldq xmm5, xmm7
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
- punpcklbw xmm4, xmm6
- punpcklbw xmm5, xmm6
-
- psubw xmm0, xmm4
- psubw xmm1, xmm5
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
- paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
- paddusw xmm6, xmm2
- SSE2_SumWHorizon1 xmm6, xmm4
- movd retrd, xmm6
- and retrd, 0xffff
- shr retrd, 1
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- add r0, 8
- add r2, 8
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
-; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movq xmm2, [eax]
- movq xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2], 0
- pinsrw xmm0, word[esi+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+2], 0
- pinsrw xmm0, word[esi+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+4], 0
- pinsrw xmm0, word[esi+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+6], 0
- pinsrw xmm0, word[esi+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [esi+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [esi+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movdqu xmm0, [ecx]
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 8
- pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 10
- pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 12
- pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 14
- pinsrb xmm0, byte[ecx+edx-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi+32], xmm0 ;H
- movdqa [esi+48], xmm1
- movd ecx, xmm4 ;dc
- add ecx, 16 ;(sum+16)
- shr ecx, 5 ;((sum+16)>>5)
- shl ecx, 4 ;
- movd mm4, ecx ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
- mov edi, 0
-.loop16x16_get_satd:
-.loopStart1:
- SSE41_I16x16GetX38x4Satd ecx, edi
- inc ecx
- cmp ecx, 4
- jl .loopStart1
- cmp edi, 16
- je .loop16x16_get_satd_end
- mov eax, [esp+24]
- add eax, 8
- mov ecx, 0
- add edi, 16
- jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
-
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ebx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_16x16
- cmp ebx, ecx
- jge near not_dc_h_16x16
-
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_16x16
- mov dword[edx], 1;I16_PRED_H
- mov eax, edi
- jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
-return_satd_intra_16x16_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movq xmm0, [ecx]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [esi], xmm0 ;V
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [esi+16], xmm0 ;H
-;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [esi+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [esi+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
-loop_chroma_satdx3_cb_cr:
- SSE41_ChromaGetX38x4Satd ecx, 0
- inc ecx
- cmp ecx, 2
- jl loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
-%endmacro
-%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- xor edi, edi
-loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- cmp edi, 1
- je loop_chroma_satdx3end
- inc edi
- SSEReg2MMX xmm4, mm0,mm1
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov ecx, [esp+44]
- mov eax, [esp+48]
- jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
- MMXReg2SSE xmm0, xmm3, mm0, mm1
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
-
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- paddw xmm6, xmm2
-
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ecx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_8x8
- cmp ebx, ecx
- jge near not_dc_h_8x8
-
- ; for DC mode
- mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_8x8
- mov dword[edx], 1;I8_PRED_H
- mov eax, edi
- jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
- ; for V mode
- mov dword[edx], 2;I8_PRED_V
- mov eax, ecx
-return_satd_intra_8x8_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
- movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
- psadbw xmm6,%2
- paddw xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov edi, [esp+40] ;temp_sad
- sub ecx, edx
- movdqa xmm5,[ecx]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd eax,xmm0
-
- add ecx,edx
- lea ebx, [edx+2*edx]
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
- mov eax, [esp+24]
- mov ebx, [esp+28]
- lea esi, [ebx+2*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
-; comparing order: DC H V
- movd ebx, xmm4 ;DC
- movd ecx, xmm3 ;V
- psrldq xmm3, 4
- movd esi, xmm3 ;H
- mov eax, [esp+36] ;lamda
- shl eax, 1
- add esi, eax
- add ebx, eax
- mov edx, [esp+32]
- cmp ebx, esi
- jge near not_dc_16x16_sad
- cmp ebx, ecx
- jge near not_dc_h_16x16_sad
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm7
-%assign x x+1
-%endrep
- jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
- ; for H mode
- cmp esi, ecx
- jge near not_dc_h_16x16_sad
- mov dword[edx], 1;I16_PRED_H
- mov eax, esi
- jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [r0+r1]
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [r2]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r2+r3]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2
- psubsw xmm1, xmm3
- movq xmm2, [r0+2*r1]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r0+r4]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [r2+2*r3]
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [r2+r5]
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4
- psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4
- pmaxuw xmm1, xmm3
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[r2]
- movd xmm5,[r2+r3]
- shufps xmm2,xmm5,0
- movd xmm3,[r2+r3*2]
- lea r2, [r3*2+r2]
- movd xmm5,[r2+r3]
- shufps xmm3,xmm5,0
- movd xmm0,[r0]
- movd xmm5,[r0+r1]
- shufps xmm0,xmm5,0
- movd xmm1,[r0+r1*2]
- lea r0, [r1*2+r0]
- movd xmm5,[r0+r1]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
- SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
-loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- push r0
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
-
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
-
- push r0
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
-loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_left
-
- pop r2
- pop r0
- add r0, 8
- add r2, 8
- mov r6, 0
-loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- POP_XMM
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqu xmm1, [r2]
- MOVDQ xmm2, [r0];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+2*r3]
- MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+r5]
- MOVDQ xmm2, [r0+r4]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
- movq xmm0, [r0]
- movq xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movhps xmm0, [r0]
- movhps xmm1, [r0+r1]
-
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-%ifdef X86_32
- push r4
- push r5
-%endif
-
- %assign push_num 2
- LOAD_4_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- lea r4, [3*r1]
- lea r5, [3*r3]
-
- pxor xmm7, xmm7
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
-
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
-
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
-
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
- %assign push_num 0
- mov r2, arg3
- push r2
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit
- pop r2
-%ifdef X86_32
- push r3
- push r4
- push r5
-%endif
- %assign push_num 3
- PUSH_XMM 8
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENSION r1, r1d
- pxor xmm7, xmm7
-
- ;ecx r2, edx r4, edi r5
-
- mov r5, r2
- and r5, 0x07
- sub r2, r5
- mov r4, 8
- sub r4, r5
-
- shl r5, 3
- shl r4, 3
- movd xmm5, r5d
- movd xmm6, r4d
- mov r5, 8
- add r5, r2
- mov r3, arg4
- SIGN_EXTENSION r3, r3d
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- POP_XMM
-%ifdef X86_32
- pop r5
- pop r4
- pop r3
-%endif
- jmp .return
-
-.pixel_sad_8x8_nsplit:
-
- pop r2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 7
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- POP_XMM
- LOAD_4_PARA_POP
-.return:
- ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
- psadbw %1, %4
- paddw xmm5, %1
- psadbw %4, %3
- paddw xmm4, %4
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm2, xmm3
- paddw xmm5, xmm2
-
- movdqu xmm2, [r2-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movdqu xmm0, [r2-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3]
- psadbw xmm1, xmm3
- paddw xmm5, xmm1
-
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub r2, r3
- movd xmm1, [r2]
- movd xmm2, [r2+r3]
- punpckldq xmm1, xmm2
- movd xmm2, [r2+r3-1]
- movd xmm3, [r2+r3+1]
-
- lea r2, [r2+2*r3]
-
- movd xmm4, [r2]
- movd xmm5, [r2-1]
- punpckldq xmm2, xmm5
- movd xmm5, [r2+1]
- punpckldq xmm3, xmm5
-
- movd xmm5, [r2+r3]
- punpckldq xmm4, xmm5
-
- punpcklqdq xmm1, xmm4 ;-L
-
- movd xmm5, [r2+r3-1]
- movd xmm6, [r2+r3+1]
-
- lea r2, [r2+2*r3]
- movd xmm7, [r2-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [r2+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
-
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [r4],xmm1
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WELS_EXTERN WelsSampleSad4x4_mmx
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movd mm0, [r0]
- movd mm1, [r0+r1]
- punpckldq mm0, mm1
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm0, mm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
-
- movd mm1, [r0]
- movd mm2, [r0+r1]
- punpckldq mm1, mm2
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
-
- movd retrd, mm0
-
- WELSEMMS
- LOAD_4_PARA_POP
- ret
--- /dev/null
+++ b/codec/common/src/WelsThreadLib.cpp
@@ -1,0 +1,473 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file WelsThreadLib.c
+ *
+ * \brief Interfaces introduced in thread programming
+ *
+ * \date 11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#ifdef LINUX
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sched.h>
+#elif !defined(_WIN32)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/param.h>
+#include <unistd.h>
+#ifdef __APPLE__
+#define HW_NCPU_NAME "hw.logicalcpu"
+#else
+#define HW_NCPU_NAME "hw.ncpu"
+#endif
+#endif
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+
+#include "WelsThreadLib.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef _WIN32
+
+#ifdef WINAPI_FAMILY
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define InitializeCriticalSection(x) InitializeCriticalSectionEx(x, 0, 0)
+#endif
+#endif
+
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
+ InitializeCriticalSection (mutex);
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
+ EnterCriticalSection (mutex);
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
+ LeaveCriticalSection (mutex);
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
+ DeleteCriticalSection (mutex);
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+#else /* _WIN32 */
+
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
+ return pthread_mutex_init (mutex, NULL);
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
+ return pthread_mutex_lock (mutex);
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
+ return pthread_mutex_unlock (mutex);
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
+ return pthread_mutex_destroy (mutex);
+}
+
+#endif /* !_WIN32 */
+
+
+#ifdef MT_ENABLED
+
+#ifdef _WIN32
+
+void WelsSleep (uint32_t dwMilliseconds) {
+ Sleep (dwMilliseconds);
+}
+
+WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* event, const char* event_name) {
+ WELS_EVENT h = CreateEvent (NULL, FALSE, FALSE, NULL);
+
+ if (h == NULL) {
+ return WELS_THREAD_ERROR_GENERAL;
+ }
+ *event = h;
+ return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event) {
+ if (SetEvent (*event)) {
+ return WELS_THREAD_ERROR_OK;
+ }
+ return WELS_THREAD_ERROR_GENERAL;
+}
+
+WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event) {
+ return WaitForSingleObject (*event, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+ return WaitForSingleObject (*event, dwMilliseconds);
+}
+
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+ WELS_EVENT* event_list, WELS_EVENT* master_event) {
+ // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
+ return WaitForMultipleObjects (nCount, event_list, FALSE, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
+ WELS_EVENT* event_list, WELS_EVENT* master_event) {
+ // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
+ return WaitForMultipleObjects (nCount, event_list, TRUE, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name) {
+ CloseHandle (*event);
+
+ *event = NULL;
+ return WELS_THREAD_ERROR_OK;
+}
+
+
+WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
+ void* arg, WELS_THREAD_ATTR attr) {
+ WELS_THREAD_HANDLE h = CreateThread (NULL, 0, routine, arg, 0, NULL);
+
+ if (h == NULL) {
+ return WELS_THREAD_ERROR_GENERAL;
+ }
+ * thread = h;
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread) {
+ WaitForSingleObject (thread, INFINITE);
+ CloseHandle (thread);
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+
+WELS_THREAD_HANDLE WelsThreadSelf() {
+ return GetCurrentThread();
+}
+
+WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+ SYSTEM_INFO si;
+
+ GetSystemInfo (&si);
+
+ pInfo->ProcessorCount = si.dwNumberOfProcessors;
+
+ return WELS_THREAD_ERROR_OK;
+}
+
+#else
+
+void WelsSleep (uint32_t dwMilliseconds) {
+ usleep (dwMilliseconds * 1000); // microseconds
+}
+
+WELS_THREAD_ERROR_CODE WelsThreadCreate (WELS_THREAD_HANDLE* thread, LPWELS_THREAD_ROUTINE routine,
+ void* arg, WELS_THREAD_ATTR attr) {
+ WELS_THREAD_ERROR_CODE err = 0;
+
+ pthread_attr_t at;
+ err = pthread_attr_init (&at);
+ if (err)
+ return err;
+#ifndef __ANDROID__
+ err = pthread_attr_setscope (&at, PTHREAD_SCOPE_SYSTEM);
+ if (err)
+ return err;
+ err = pthread_attr_setschedpolicy (&at, SCHED_FIFO);
+ if (err)
+ return err;
+#endif
+ err = pthread_create (thread, &at, routine, arg);
+
+ pthread_attr_destroy (&at);
+
+ return err;
+}
+
+WELS_THREAD_ERROR_CODE WelsThreadJoin (WELS_THREAD_HANDLE thread) {
+ return pthread_join (thread, NULL);
+}
+
+WELS_THREAD_HANDLE WelsThreadSelf() {
+ return pthread_self();
+}
+
+// unnamed semaphores aren't supported on OS X
+
+WELS_THREAD_ERROR_CODE WelsEventOpen (WELS_EVENT* p_event, const char* event_name) {
+#ifdef __APPLE__
+ if (p_event == NULL || event_name == NULL)
+ return WELS_THREAD_ERROR_GENERAL;
+ *p_event = sem_open (event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0);
+ if (*p_event == (sem_t*)SEM_FAILED) {
+ sem_unlink (event_name);
+ *p_event = NULL;
+ return WELS_THREAD_ERROR_GENERAL;
+ } else {
+ return WELS_THREAD_ERROR_OK;
+ }
+#else
+ WELS_EVENT event = (WELS_EVENT) malloc(sizeof(*event));
+ if (event == NULL)
+ return WELS_THREAD_ERROR_GENERAL;
+ WELS_THREAD_ERROR_CODE err = sem_init(event, 0, 0);
+ if (!err) {
+ *p_event = event;
+ return err;
+ }
+ free(event);
+ return err;
+#endif
+}
+WELS_THREAD_ERROR_CODE WelsEventClose (WELS_EVENT* event, const char* event_name) {
+#ifdef __APPLE__
+ WELS_THREAD_ERROR_CODE err = sem_close (*event); // match with sem_open
+ if (event_name)
+ sem_unlink (event_name);
+ return err;
+#else
+ WELS_THREAD_ERROR_CODE err = sem_destroy (*event); // match with sem_init
+ free(*event);
+ return err;
+#endif
+}
+
+WELS_THREAD_ERROR_CODE WelsEventSignal (WELS_EVENT* event) {
+ WELS_THREAD_ERROR_CODE err = 0;
+// int32_t val = 0;
+// sem_getvalue(event, &val);
+// fprintf( stderr, "before signal it, val= %d..\n",val );
+ err = sem_post (*event);
+// sem_getvalue(event, &val);
+// fprintf( stderr, "after signal it, val= %d..\n",val );
+ return err;
+}
+
+WELS_THREAD_ERROR_CODE WelsEventWait (WELS_EVENT* event) {
+ return sem_wait (*event); // blocking until signaled
+}
+
+WELS_THREAD_ERROR_CODE WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+ if (dwMilliseconds != (uint32_t) - 1) {
+ return sem_wait (*event);
+ } else {
+#if defined(__APPLE__)
+ int32_t err = 0;
+ int32_t wait_count = 0;
+ do {
+ err = sem_trywait (*event);
+ if (WELS_THREAD_ERROR_OK == err)
+ break;// WELS_THREAD_ERROR_OK;
+ else if (wait_count > 0)
+ break;
+ usleep (dwMilliseconds * 1000);
+ ++ wait_count;
+ } while (1);
+ return err;
+#else
+ struct timespec ts;
+ struct timeval tv;
+
+ gettimeofday (&tv, 0);
+
+ ts.tv_nsec = tv.tv_usec * 1000 + dwMilliseconds * 1000000;
+ ts.tv_sec = tv.tv_sec + ts.tv_nsec / 1000000000;
+ ts.tv_nsec %= 1000000000;
+
+ return sem_timedwait (*event, &ts);
+#endif//__APPLE__
+ }
+}
+
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+ WELS_EVENT* event_list, WELS_EVENT* master_event) {
+ uint32_t nIdx = 0;
+ uint32_t uiAccessTime = 2; // 2 us once
+
+ if (nCount == 0)
+ return WELS_THREAD_ERROR_WAIT_FAILED;
+
+ if (master_event != NULL) {
+ // This design relies on the events actually being semaphores;
+ // if multiple events in the list have been signalled, the master
+ // event should have a similar count (events in windows can't keep
+ // track of the actual count, but the master event isn't needed there
+ // since it uses WaitForMultipleObjects).
+ int32_t err = sem_wait (*master_event);
+ if (err != WELS_THREAD_ERROR_OK)
+ return err;
+ uiAccessTime = 0; // no blocking, just quickly loop through all to find the one that was signalled
+ }
+
+ while (1) {
+ nIdx = 0; // access each event by order
+ while (nIdx < nCount) {
+ int32_t err = 0;
+ int32_t wait_count = 0;
+
+ /*
+ * although such interface is not used in __GNUC__ like platform, to use
+ * pthread_cond_timedwait() might be better choice if need
+ */
+ do {
+ err = sem_trywait (event_list[nIdx]);
+ if (WELS_THREAD_ERROR_OK == err)
+ return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
+ else if (wait_count > 0 || uiAccessTime == 0)
+ break;
+ usleep (uiAccessTime);
+ ++ wait_count;
+ } while (1);
+ // we do need access next event next time
+ ++ nIdx;
+ }
+ usleep (1); // switch to working threads
+ if (master_event != NULL) {
+ // A master event was used and was signalled, but none of the events in the
+ // list was found to be signalled, thus wait a little more when rechecking
+ // the list to avoid busylooping here.
+ // If we ever hit this codepath it's mostly a bug in the code that signals
+ // the events.
+ uiAccessTime = 2;
+ }
+ }
+
+ return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
+ WELS_EVENT* event_list, WELS_EVENT* master_event) {
+ uint32_t nIdx = 0;
+ uint32_t uiCountSignals = 0;
+ uint32_t uiSignalFlag = 0; // UGLY: suppose maximal event number up to 32
+
+ if (nCount == 0 || nCount > (sizeof (uint32_t) << 3))
+ return WELS_THREAD_ERROR_WAIT_FAILED;
+
+ while (1) {
+ nIdx = 0; // access each event by order
+ while (nIdx < nCount) {
+ const uint32_t kuiBitwiseFlag = (1 << nIdx);
+
+ if ((uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag) { // non-blocking mode
+ int32_t err = 0;
+// fprintf( stderr, "sem_wait(): start to wait event %d..\n", nIdx );
+ if (master_event == NULL) {
+ err = sem_wait (event_list[nIdx]);
+ } else {
+ err = sem_wait (*master_event);
+ if (err == WELS_THREAD_ERROR_OK) {
+ err = sem_wait (event_list[nIdx]);
+ if (err != WELS_THREAD_ERROR_OK) {
+ // We successfully waited for the master event,
+ // but waiting for the individual event failed (e.g. EINTR?).
+ // Increase the master event count so that the next retry will
+ // work as intended.
+ sem_post (*master_event);
+ }
+ }
+ }
+// fprintf( stderr, "sem_wait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
+ if (WELS_THREAD_ERROR_OK == err) {
+// int32_t val = 0;
+// sem_getvalue(&event_list[nIdx], &val);
+// fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
+
+ uiSignalFlag |= kuiBitwiseFlag;
+ ++ uiCountSignals;
+ if (uiCountSignals >= nCount) {
+ return WELS_THREAD_ERROR_OK;
+ }
+ }
+ }
+ // we do need access next event next time
+ ++ nIdx;
+ }
+ }
+
+ return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+#ifdef ANDROID_NDK
+ pInfo->ProcessorCount = android_getCpuCount();
+ return WELS_THREAD_ERROR_OK;
+#elif defined(LINUX)
+
+ cpu_set_t cpuset;
+
+ CPU_ZERO (&cpuset);
+
+ if (!sched_getaffinity (0, sizeof (cpuset), &cpuset))
+ pInfo->ProcessorCount = CPU_COUNT (&cpuset);
+ else
+ pInfo->ProcessorCount = 1;
+
+ return WELS_THREAD_ERROR_OK;
+
+#else
+
+ size_t len = sizeof (pInfo->ProcessorCount);
+
+ if (sysctlbyname (HW_NCPU_NAME, &pInfo->ProcessorCount, &len, NULL, 0) == -1)
+ pInfo->ProcessorCount = 1;
+
+ return WELS_THREAD_ERROR_OK;
+
+#endif//LINUX
+}
+
+#endif
+
+
+#endif // MT_ENABLED
+
--- /dev/null
+++ b/codec/common/src/cpu.cpp
@@ -1,0 +1,293 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file cpu.cpp
+ *
+ * \brief CPU compatibility detection
+ *
+ * \date 04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdio.h>
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#include "cpu.h"
+#include "cpu_core.h"
+
+
+
+#define CPU_Vendor_AMD "AuthenticAMD"
+#define CPU_Vendor_INTEL "GenuineIntel"
+#define CPU_Vendor_CYRIX "CyrixInstead"
+
+#if defined(X86_ASM)
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+ uint32_t uiCPU = 0;
+ uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+ int32_t CacheLineSize = 0;
+ int8_t chVendorName[16] = { 0 };
+ uint32_t uiMaxCpuidLevel = 0;
+
+ if (!WelsCPUIdVerify()) {
+ /* cpuid is not supported in cpu */
+ return 0;
+ }
+
+ WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVendorName[0], (uint32_t*)&chVendorName[8], (uint32_t*)&chVendorName[4]);
+ uiMaxCpuidLevel = uiFeatureA;
+ if (uiMaxCpuidLevel == 0) {
+ /* maximum input value for basic cpuid information */
+ return 0;
+ }
+
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ if ((uiFeatureD & 0x00800000) == 0) {
+ /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
+ return 0;
+ }
+
+ uiCPU = WELS_CPU_MMX;
+ if (uiFeatureD & 0x02000000) {
+ /* SSE technology is identical to AMD MMX extensions */
+ uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+ }
+ if (uiFeatureD & 0x04000000) {
+ /* SSE2 support here */
+ uiCPU |= WELS_CPU_SSE2;
+ }
+ if (uiFeatureD & 0x00000001) {
+ /* x87 FPU on-chip checking */
+ uiCPU |= WELS_CPU_FPU;
+ }
+ if (uiFeatureD & 0x00008000) {
+ /* CMOV instruction checking */
+ uiCPU |= WELS_CPU_CMOV;
+ }
+ if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) ||
+ (!strcmp((const char*)chVendorName, CPU_Vendor_AMD)) ) { // confirmed_safe_unsafe_usage
+ if (uiFeatureD & 0x10000000) {
+ /* Multi-Threading checking: contains of multiple logic processors */
+ uiCPU |= WELS_CPU_HTT;
+ }
+ }
+
+ if (uiFeatureC & 0x00000001) {
+ /* SSE3 support here */
+ uiCPU |= WELS_CPU_SSE3;
+ }
+ if (uiFeatureC & 0x00000200) {
+ /* SSSE3 support here */
+ uiCPU |= WELS_CPU_SSSE3;
+ }
+ if (uiFeatureC & 0x00080000) {
+ /* SSE4.1 support here, 45nm Penryn processor */
+ uiCPU |= WELS_CPU_SSE41;
+ }
+ if (uiFeatureC & 0x00100000) {
+ /* SSE4.2 support here, next generation Nehalem processor */
+ uiCPU |= WELS_CPU_SSE42;
+ }
+ if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {
+ /* AVX supported */
+ uiCPU |= WELS_CPU_AVX;
+ }
+ if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {
+ /* AVX FMA supported */
+ uiCPU |= WELS_CPU_FMA;
+ }
+ if (uiFeatureC & 0x02000000) {
+ /* AES checking */
+ uiCPU |= WELS_CPU_AES;
+ }
+ if (uiFeatureC & 0x00400000) {
+ /* MOVBE checking */
+ uiCPU |= WELS_CPU_MOVBE;
+ }
+
+ if( pNumberOfLogicProcessors != NULL ){
+ if( uiCPU & WELS_CPU_HTT){
+ *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+ } else {
+ *pNumberOfLogicProcessors = 0;
+ }
+ if( !strcmp((const char*)chVendorName, CPU_Vendor_INTEL) ){
+ if( uiMaxCpuidLevel >= 4 ){
+ uiFeatureC = 0;
+ WelsCPUId(0x4, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ if( uiFeatureA != 0 ){
+ *pNumberOfLogicProcessors = ((uiFeatureA&0xfc000000)>>26) + 1;
+ }
+ }
+ }
+ }
+
+ WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+ if ((!strcmp ((const char*)chVendorName, CPU_Vendor_AMD))
+ && (uiFeatureA >= 0x80000001)) { // confirmed_safe_unsafe_usage
+ WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ if (uiFeatureD & 0x00400000) {
+ uiCPU |= WELS_CPU_MMXEXT;
+ }
+ if (uiFeatureD & 0x80000000) {
+ uiCPU |= WELS_CPU_3DNOW;
+ }
+ }
+
+ if (!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) { // confirmed_safe_unsafe_usage
+ int32_t family, model;
+
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
+ model = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
+
+ if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+ uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+ }
+ }
+
+ // get cache line size
+ if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL))
+ || ! (strcmp ((const char*)chVendorName, CPU_Vendor_CYRIX))) { // confirmed_safe_unsafe_usage
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+ CacheLineSize = (uiFeatureB & 0xff00) >>
+ 5; // ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+ if (CacheLineSize == 128) {
+ uiCPU |= WELS_CPU_CACHELINE_128;
+ } else if (CacheLineSize == 64) {
+ uiCPU |= WELS_CPU_CACHELINE_64;
+ } else if (CacheLineSize == 32) {
+ uiCPU |= WELS_CPU_CACHELINE_32;
+ } else if (CacheLineSize == 16) {
+ uiCPU |= WELS_CPU_CACHELINE_16;
+ }
+ }
+
+ return uiCPU;
+}
+
+
+void WelsCPURestore (const uint32_t kuiCPU) {
+ if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
+ WelsEmms();
+ }
+}
+
+#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
+#if defined(ANDROID_NDK)
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
+{
+ uint32_t uiCPU = 0;
+ AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+ uint64_t uiFeatures = 0;
+ cpuFamily = android_getCpuFamily();
+ if (cpuFamily == ANDROID_CPU_FAMILY_ARM) {
+ uiFeatures = android_getCpuFeatures();
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
+ uiCPU |= WELS_CPU_ARMv7;
+ }
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
+ uiCPU |= WELS_CPU_VFPv3;
+ }
+ if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
+ uiCPU |= WELS_CPU_NEON;
+ }
+ }
+
+ if( pNumberOfLogicProcessors != NULL ){
+ *pNumberOfLogicProcessors = android_getCpuCount();
+ }
+
+ return uiCPU;
+}
+
+#elif defined(__APPLE__)
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
+{
+ uint32_t uiCPU = 0;
+
+#if defined(__ARM_NEON__)
+ uiCPU |= WELS_CPU_ARMv7;
+ uiCPU |= WELS_CPU_VFPv3;
+ uiCPU |= WELS_CPU_NEON;
+#endif
+ return uiCPU;
+}
+#elif defined(__linux__)
+
+/* Generic arm/linux cpu feature detection */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+ FILE *f = fopen("/proc/cpuinfo", "r");
+
+ if (!f)
+ return 0;
+
+ char buf[200];
+ int flags = 0;
+ while (fgets(buf, sizeof(buf), f)) {
+ if (!strncmp(buf, "Features", strlen("Features"))) {
+ if (strstr(buf, " neon "))
+ flags |= WELS_CPU_NEON;
+ if (strstr(buf, " vfpv3 "))
+ flags |= WELS_CPU_VFPv3;
+ break;
+ }
+ }
+ fclose(f);
+ return flags;
+}
+
+#else /* HAVE_NEON enabled but no runtime detection */
+
+/* No runtime feature detection available, but built with HAVE_NEON - assume
+ * that NEON and all associated features are available. */
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+ return WELS_CPU_ARMv7 |
+ WELS_CPU_VFPv3 |
+ WELS_CPU_NEON;
+}
+#endif
+#else /* Neither X86_ASM nor HAVE_NEON */
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+ return 0;
+}
+
+#endif
+
+
--- /dev/null
+++ b/codec/common/src/crt_util_safe_x.cpp
@@ -1,0 +1,256 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file crt_utils_safe_x.cpp
+ *
+ * \brief common tool/function utilization
+ *
+ * \date 03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#if defined(_WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#ifndef _MSC_VER
+#include <sys/time.h>
+#endif //!_MSC_VER
+#else
+#include <sys/time.h>
+#include <sys/timeb.h>
+#endif //_WIN32
+
+#include "macros.h"
+#include "crt_util_safe_x.h" // Safe CRT routines like utils for cross platforms
+
+#if defined(_WIN32) && defined(_MSC_VER)
+
+#if defined(_MSC_VER) && (_MSC_VER>=1500)
+
+int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
+ va_list pArgPtr;
+ int32_t iRc;
+
+ va_start (pArgPtr, kpFormat);
+
+ iRc = vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
+
+ va_end (pArgPtr);
+
+ return iRc;
+}
+
+char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+ strncpy_s (pDest, iSizeInBytes, kpSrc, _TRUNCATE);
+
+ return pDest;
+}
+
+int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
+ return vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
+}
+
+WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
+ WelsFileHandle* pFp = NULL;
+ if (fopen_s (&pFp, kpFilename, kpMode) != 0) {
+ return NULL;
+ }
+
+ return pFp;
+}
+
+int32_t WelsFclose (WelsFileHandle* pFp) {
+ return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+ return _ftime_s (pTp);
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+ struct tm sTimeNow;
+ int32_t iRc;
+
+ localtime_s (&sTimeNow, &kpTp->time);
+
+ iRc = strftime (pBuffer, iSize, kpFormat, &sTimeNow);
+ if (iRc == 0)
+ pBuffer[0] = '\0';
+ return iRc;
+}
+
+#else
+
+int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
+ va_list pArgPtr;
+ int32_t iRc;
+
+ va_start (pArgPtr, kpFormat);
+
+ iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
+ if (iRc < 0)
+ pBuffer[iSizeOfBuffer - 1] = '\0';
+
+ va_end (pArgPtr);
+
+ return iRc;
+}
+
+char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+ strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
+ pDest[iSizeInBytes - 1] = '\0';
+
+ return pDest;
+}
+
+int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
+ int32_t iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
+ if (iRc < 0)
+ pBuffer[iSizeOfBuffer - 1] = '\0';
+ return iRc;
+}
+
+
+WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
+ return fopen (kpFilename, kpMode);
+}
+
+int32_t WelsFclose (WelsFileHandle* pFp) {
+ return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+ _ftime (pTp);
+ return 0;
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+ struct tm* pTnow;
+ int32_t iRc;
+
+ pTnow = localtime (&kpTp->time);
+
+ iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
+ if (iRc == 0)
+ pBuffer[0] = '\0';
+ return iRc;
+}
+
+
+#endif // _MSC_VER
+
+#else //GCC
+
+int32_t WelsSnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, ...) {
+ va_list pArgPtr;
+ int32_t iRc;
+
+ va_start (pArgPtr, kpFormat);
+
+ iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
+
+ va_end (pArgPtr);
+
+ return iRc;
+}
+
+char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+ strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
+ pDest[iSizeInBytes - 1] = '\0';
+ return pDest;
+}
+
+int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
+ return vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
+}
+
+WelsFileHandle* WelsFopen (const char* kpFilename, const char* kpMode) {
+ return fopen (kpFilename, kpMode);
+}
+
+int32_t WelsFclose (WelsFileHandle* pFp) {
+ return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+ struct timeval sTv;
+
+ if (gettimeofday (&sTv, NULL)) {
+ return -1;
+ }
+
+ pTp->time = sTv.tv_sec;
+ pTp->millitm = (uint16_t)sTv.tv_usec / 1000;
+
+ return 0;
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+ struct tm* pTnow;
+ int32_t iRc;
+
+ pTnow = localtime (&kpTp->time);
+
+ iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
+ if (iRc == 0)
+ pBuffer[0] = '\0';
+ return iRc;
+}
+
+#endif
+
+
+char* WelsStrcat (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+ int32_t iCurLen = strlen(pDest);
+ return WelsStrncpy(pDest + iCurLen, iSizeInBytes - iCurLen, kpSrc);
+}
+
+int32_t WelsFwrite (const void* kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp) {
+ return fwrite (kpBuffer, iSize, iCount, pFp);
+}
+
+uint16_t WelsGetMillisecond (const SWelsTime* kpTp) {
+ return kpTp->millitm;
+}
+
+int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin) {
+ return fseek(fp, offset, origin);
+}
+
+int32_t WelsFflush (WelsFileHandle* pFp) {
+ return fflush (pFp);
+}
--- /dev/null
+++ b/codec/common/src/deblocking_common.cpp
@@ -1,0 +1,204 @@
+#include "deblocking_common.h"
+#include "macros.h"
+// C code only
+void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTc) {
+ for (int32_t i = 0; i < 16; i++) {
+ int32_t iTc0 = pTc[i >> 2];
+ if (iTc0 >= 0) {
+ int32_t p0 = pPix[-iStrideX];
+ int32_t p1 = pPix[-2 * iStrideX];
+ int32_t p2 = pPix[-3 * iStrideX];
+ int32_t q0 = pPix[0];
+ int32_t q1 = pPix[iStrideX];
+ int32_t q2 = pPix[2 * iStrideX];
+ bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+ bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+ int32_t iTc = iTc0;
+ if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+ bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
+ bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
+ if (bDetaP2P0) {
+ pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);
+ iTc++;
+ }
+ if (bDetaQ2Q0) {
+ pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);
+ iTc++;
+ }
+ int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
+ pPix[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
+ pPix[0] = WelsClip1 (q0 - iDeta); /* q0' */
+ }
+ }
+ pPix += iStrideY;
+ }
+}
+void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
+ int32_t p0, p1, p2, q0, q1, q2;
+ int32_t iDetaP0Q0;
+ bool bDetaP1P0, bDetaQ1Q0;
+ for (int32_t i = 0; i < 16; i++) {
+ p0 = pPix[-iStrideX];
+ p1 = pPix[-2 * iStrideX];
+ p2 = pPix[-3 * iStrideX];
+ q0 = pPix[0];
+ q1 = pPix[iStrideX];
+ q2 = pPix[2 * iStrideX];
+ iDetaP0Q0 = WELS_ABS (p0 - q0);
+ bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+ if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
+ if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
+ bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
+ bool bDetaQ2Q0 = WELS_ABS (q2 - q0) < iBeta;
+ if (bDetaP2P0) {
+ const int32_t p3 = pPix[-4 * iStrideX];
+ pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3; //p0
+ pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2; //p1
+ pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
+ } else {
+ pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; //p0
+ }
+ if (bDetaQ2Q0) {
+ const int32_t q3 = pPix[3 * iStrideX];
+ pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3; //q0
+ pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2; //q1
+ pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
+ } else {
+ pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; //q0
+ }
+ } else {
+ pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; //p0
+ pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2; //q0
+ }
+ }
+ pPix += iStrideY;
+ }
+}
+void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+ DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
+}
+void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+ DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
+}
+void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
+}
+void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
+}
+void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+ int32_t iBeta, int8_t* pTc) {
+ int32_t p0, p1, q0, q1, iDeta;
+ bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+
+ for (int32_t i = 0; i < 8; i++) {
+ int32_t iTc0 = pTc[i >> 1];
+ if (iTc0 > 0) {
+ p0 = pPixCb[-iStrideX];
+ p1 = pPixCb[-2 * iStrideX];
+ q0 = pPixCb[0];
+ q1 = pPixCb[iStrideX];
+
+ bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+ bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+ if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+ iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+ pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
+ pPixCb[0] = WelsClip1 (q0 - iDeta); /* q0' */
+ }
+
+
+ p0 = pPixCr[-iStrideX];
+ p1 = pPixCr[-2 * iStrideX];
+ q0 = pPixCr[0];
+ q1 = pPixCr[iStrideX];
+
+ bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+ bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+
+ if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+ iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+ pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta); /* p0' */
+ pPixCr[0] = WelsClip1 (q0 - iDeta); /* q0' */
+ }
+ }
+ pPixCb += iStrideY;
+ pPixCr += iStrideY;
+ }
+}
+void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+ int32_t iBeta) {
+ int32_t p0, p1, q0, q1;
+ bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+ for (int32_t i = 0; i < 8; i++) {
+ //cb
+ p0 = pPixCb[-iStrideX];
+ p1 = pPixCb[-2 * iStrideX];
+ q0 = pPixCb[0];
+ q1 = pPixCb[iStrideX];
+ bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+ bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+ if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+ pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; /* p0' */
+ pPixCb[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; /* q0' */
+ }
+
+ //cr
+ p0 = pPixCr[-iStrideX];
+ p1 = pPixCr[-2 * iStrideX];
+ q0 = pPixCr[0];
+ q1 = pPixCr[iStrideX];
+ bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+ bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+ bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+ if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+ pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2; /* p0' */
+ pPixCr[0] = ((q1 << 1) + q0 + p1 + 2) >> 2; /* q0' */
+ }
+ pPixCr += iStrideY;
+ pPixCb += iStrideY;
+ }
+}
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* tc) {
+ DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
+}
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* tc) {
+ DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
+}
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
+}
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
+}
+
+#ifdef X86_ASM
+extern "C" {
+ void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
+
+ DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+ DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
+ DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+ }
+
+ void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
+
+ DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+ DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
+ DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
+ }
+
+}
+
+#endif
+
--- /dev/null
+++ b/codec/common/src/logging.cpp
@@ -1,0 +1,49 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * Copyright (c) 2013, Mozilla
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "typedefs.h"
+
+static int32_t g_TraceLevel = 0;
+
+void WelsStderrSetTraceLevel (int32_t level) {
+ g_TraceLevel = level;
+}
+
+int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap) {
+ if (level < g_TraceLevel) {
+ vfprintf (stderr, format, ap);
+ }
+ return 0;
+}
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -1,23 +1,23 @@
COMMON_SRCDIR=codec/common
COMMON_CPP_SRCS=\
- $(COMMON_SRCDIR)/cpu.cpp\
- $(COMMON_SRCDIR)/crt_util_safe_x.cpp\
- $(COMMON_SRCDIR)/deblocking_common.cpp\
- $(COMMON_SRCDIR)/logging.cpp\
- $(COMMON_SRCDIR)/WelsThreadLib.cpp\
+ $(COMMON_SRCDIR)/src/cpu.cpp\
+ $(COMMON_SRCDIR)/src/crt_util_safe_x.cpp\
+ $(COMMON_SRCDIR)/src/deblocking_common.cpp\
+ $(COMMON_SRCDIR)/src/logging.cpp\
+ $(COMMON_SRCDIR)/src/WelsThreadLib.cpp\
COMMON_OBJS += $(COMMON_CPP_SRCS:.cpp=.$(OBJ))
ifeq ($(ASM_ARCH), x86)
COMMON_ASM_SRCS=\
- $(COMMON_SRCDIR)/cpuid.asm\
- $(COMMON_SRCDIR)/deblock.asm\
- $(COMMON_SRCDIR)/expand_picture.asm\
- $(COMMON_SRCDIR)/mb_copy.asm\
- $(COMMON_SRCDIR)/mc_chroma.asm\
- $(COMMON_SRCDIR)/mc_luma.asm\
- $(COMMON_SRCDIR)/satd_sad.asm\
- $(COMMON_SRCDIR)/vaa.asm\
+ $(COMMON_SRCDIR)/x86/cpuid.asm\
+ $(COMMON_SRCDIR)/x86/deblock.asm\
+ $(COMMON_SRCDIR)/x86/expand_picture.asm\
+ $(COMMON_SRCDIR)/x86/mb_copy.asm\
+ $(COMMON_SRCDIR)/x86/mc_chroma.asm\
+ $(COMMON_SRCDIR)/x86/mc_luma.asm\
+ $(COMMON_SRCDIR)/x86/satd_sad.asm\
+ $(COMMON_SRCDIR)/x86/vaa.asm\
COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.$(OBJ))
endif
@@ -24,9 +24,9 @@
ifeq ($(ASM_ARCH), arm)
COMMON_ASM_S_SRCS=\
- $(COMMON_SRCDIR)/deblocking_neon.S\
- $(COMMON_SRCDIR)/expand_picture_neon.S\
- $(COMMON_SRCDIR)/mc_neon.S\
+ $(COMMON_SRCDIR)/arm/deblocking_neon.S\
+ $(COMMON_SRCDIR)/arm/expand_picture_neon.S\
+ $(COMMON_SRCDIR)/arm/mc_neon.S\
COMMON_OBJS += $(COMMON_ASM_S_SRCS:.S=.$(OBJ))
endif
--- a/codec/common/typedefs.h
+++ /dev/null
@@ -1,74 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-// typedef.h
-#ifndef WELS_TYPE_DEFINES_H__
-#define WELS_TYPE_DEFINES_H__
-
-#include <limits.h>
-#include <stddef.h>
-
-////////////////////////////////////////////////////////////////////////////
-// NOTICE : ALL internal implement MUST use the data type defined as below
-// ONLY except with the interface file !!!!!
-////////////////////////////////////////////////////////////////////////////
-
-#ifndef _MSC_VER
-
-#define __STDC_FORMAT_MACROS
-#include <stdint.h>
-#include <inttypes.h>
-
-#else
-
-// FIXME: all singed type should be declared explicit, for example, int8_t should be declared as signed char.
-typedef signed char int8_t ;
-typedef unsigned char uint8_t ;
-typedef short int16_t ;
-typedef unsigned short uint16_t;
-typedef int int32_t ;
-typedef unsigned int uint32_t;
-typedef __int64 int64_t ;
-typedef unsigned __int64 uint64_t;
-#define PRId64 "I64d"
-
-#endif // _MSC_VER defined
-
-// The 'float' type is portable and usable without any need for any extra typedefs.
-
-#ifdef EPSN
-#undef EPSN
-#endif//EPSN
-#define EPSN (0.000001f) // (1e-6) // desired float precision
-
-#endif //WELS_TYPE_DEFINES_H__
-
--- a/codec/common/vaa.asm
+++ /dev/null
@@ -1,411 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
-;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $04
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [r0 ] ; line 0
- movdqa %2, [r0+r1] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [r0+r2] ; line 2
- movdqa %4, [r0+r3] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $04
-%endmacro
-
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-; , 6/7/2010
-
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1,r1d
-
-%ifdef X86_32
- push r3
- push r4
- push r5
- push r6
- %assign push_num push_num+4
-%endif
-
- mov r5,r7
- and r5,0fh
- sub r7,r5
- sub r7,32
-
-
- mov r2,r1
- sal r2,$01 ;r2 = 2*iLineSize
- mov r3,r2
- add r3,r1 ;r3 = 3*iLineSize
-
- mov r4,r2
- sal r4,$01 ;r4 = 4*iLineSize
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7], xmm0
-
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+8], xmm0
-
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+16], xmm0
-
- lea r0, [r0+r4]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+24], xmm0
-
- movdqa xmm0, [r7] ; block 0~7
- movdqa xmm1, [r7+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
-
-
- movd r2d, xmm0
- and r2, 0ffffh ; effective low work truncated
- mov r3, r2
- imul r2, r3
- sar r2, $04
- movd retrd, xmm1
- sub retrd, r2d
-
- add r7,32
- add r7,r5
-
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
-%endif
- POP_XMM
-
- ret
-
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1,r1d
-
-%ifdef X86_32
- push r3
- push r4
- push r5
- push r6
- %assign push_num push_num+4
-%endif
-
- mov r5,r7
- and r5,0fh
- sub r7,r5
- sub r7,32
-
-
- mov r2,r1
- sal r2,$01 ;r2 = 2*iLineSize
- mov r3,r2
- add r3,r1 ;r3 = 3*iLineSize
-
- mov r4,r2
- sal r4,$01 ;r4 = 4*iLineSize
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7],xmm0
-
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [r7+8],xmm1
-
-
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [r7+16],xmm0
-
- lea r0,[r0+r4]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [r7+24],xmm1
-
-
- movdqa xmm0,[r7]
- movdqa xmm1,[r7+16]
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
-
- movd r2d, xmm0
- and r2, 0ffffh ; effective low work truncated
- mov r3, r2
- imul r2, r3
- sar r2, $04
- movd retrd, xmm1
- sub retrd, r2d
-
- add r7,32
- add r7,r5
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
- pop r3
-%endif
- POP_XMM
-
- ret
-
-;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
-;***********************************************************************
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0,[r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
- pshufd xmm4, xmm3, 01Bh
- paddd xmm4, xmm3
- pshufd xmm3, xmm4, 0B1h
- paddd xmm3, xmm4
- movd r0d, xmm3
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
-
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
-.threshold_exit:
- mov retrd, 15
- ret
-
-;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
-;***********************************************************************
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
-
- ; to replace pmulld functionality as below
- movdqa xmm2, xmm3
- pmuludq xmm2, xmm3
- pshufd xmm4, xmm3, 0B1h
- pmuludq xmm4, xmm4
- movdqa xmm5, xmm2
- punpckldq xmm5, xmm4
- punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
-
- pshufd xmm4, xmm5, 01Bh
- paddd xmm4, xmm5
- pshufd xmm5, xmm4, 0B1h
- paddd xmm5, xmm4
-
- movd r0d, xmm5
- cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 01Bh
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps retrd, xmm0
- ret
-.threshold_exit:
- mov retrd, 15
- ret
--- /dev/null
+++ b/codec/common/x86/asm_inc.asm
@@ -1,0 +1,599 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* sse2inc.asm
+;*
+;* Abstract
+;* macro and constant
+;*
+;* History
+;* 8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+%if 1
+ %define MOVDQ movdqa
+%else
+ %define MOVDQ movdqu
+%endif
+
+%if 1
+ %define WELSEMMS emms
+%else
+ %define WELSEMMS
+%endif
+
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+DEFAULT REL
+
+%ifdef WIN64 ; Windows x64 ;************************************
+
+BITS 64
+
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + push_num*8 + 40]
+%define arg6 [rsp + push_num*8 + 48]
+%define arg7 [rsp + push_num*8 + 56]
+%define arg8 [rsp + push_num*8 + 64]
+%define arg9 [rsp + push_num*8 + 72]
+%define arg10 [rsp + push_num*8 + 80]
+%define arg11 [rsp + push_num*8 + 88]
+%define arg12 [rsp + push_num*8 + 96]
+
+%define r0 rcx
+%define r1 rdx
+%define r2 r8
+%define r3 r9
+%define r4 rax
+%define r5 r10
+%define r6 r11
+%define r7 rsp
+
+%define r0d ecx
+%define r1d edx
+%define r2d r8d
+%define r3d r9d
+%define r4d eax
+%define r5d r10d
+%define r6d r11d
+
+%define r0w cx
+%define r1w dx
+%define r2w r8w
+%define r3w r9w
+
+%define r0b cl
+%define r1b dl
+%define r2b r8l
+%define r3b r9l
+
+%define PUSHRFLAGS pushfq
+%define POPRFLAGS popfq
+%define retrq rax
+%define retrd eax
+
+%elifdef UNIX64 ; Unix x64 ;************************************
+
+BITS 64
+
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%define arg6 r9
+%define arg7 [rsp + push_num*8 + 8]
+%define arg8 [rsp + push_num*8 + 16]
+%define arg9 [rsp + push_num*8 + 24]
+%define arg10 [rsp + push_num*8 + 32]
+%define arg11 [rsp + push_num*8 + 40]
+%define arg12 [rsp + push_num*8 + 48]
+
+%define r0 rdi
+%define r1 rsi
+%define r2 rdx
+%define r3 rcx
+%define r4 r8
+%define r5 r9
+%define r6 r10
+%define r7 rsp
+
+%define r0d edi
+%define r1d esi
+%define r2d edx
+%define r3d ecx
+%define r4d r8d
+%define r5d r9d
+%define r6d r10d
+
+%define r0w di
+%define r1w si
+%define r2w dx
+%define r3w cx
+
+%define r0b dil
+%define r1b sil
+%define r2b dl
+%define r3b cl
+
+%define PUSHRFLAGS pushfq
+%define POPRFLAGS popfq
+%define retrq rax
+%define retrd eax
+
+%elifdef X86_32 ; X86_32 ;************************************
+
+BITS 32
+
+%define arg1 [esp + push_num*4 + 4]
+%define arg2 [esp + push_num*4 + 8]
+%define arg3 [esp + push_num*4 + 12]
+%define arg4 [esp + push_num*4 + 16]
+%define arg5 [esp + push_num*4 + 20]
+%define arg6 [esp + push_num*4 + 24]
+%define arg7 [esp + push_num*4 + 28]
+%define arg8 [esp + push_num*4 + 32]
+%define arg9 [esp + push_num*4 + 36]
+%define arg10 [esp + push_num*4 + 40]
+%define arg11 [esp + push_num*4 + 44]
+%define arg12 [esp + push_num*4 + 48]
+
+%define r0 eax
+%define r1 ecx
+%define r2 edx
+%define r3 ebx
+%define r4 esi
+%define r5 edi
+%define r6 ebp
+%define r7 esp
+
+%define r0d eax
+%define r1d ecx
+%define r2d edx
+%define r3d ebx
+%define r4d esi
+%define r5d edi
+%define r6d ebp
+
+%define r0w ax
+%define r1w cx
+%define r2w dx
+%define r3w bx
+
+%define r0b al
+%define r1b cl
+%define r2b dl
+%define r3b bl
+
+%define PUSHRFLAGS pushfd
+%define POPRFLAGS popfd
+%define retrq eax ; 32 bit mode do not support 64 bits regesters
+%define retrd eax
+
+%endif
+
+%macro LOAD_PARA 2
+ mov %1, %2
+%endmacro
+
+%macro LOAD_1_PARA 0
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ %endif
+%endmacro
+
+%macro LOAD_2_PARA 0
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ %endif
+%endmacro
+
+%macro LOAD_3_PARA 0
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ %endif
+%endmacro
+
+%macro LOAD_4_PARA 0
+ %ifdef X86_32
+ push r3
+ %assign push_num push_num+1
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ %endif
+%endmacro
+
+%macro LOAD_5_PARA 0
+ %ifdef X86_32
+ push r3
+ push r4
+ %assign push_num push_num+2
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40]
+ %endif
+%endmacro
+
+%macro LOAD_6_PARA 0
+ %ifdef X86_32
+ push r3
+ push r4
+ push r5
+ %assign push_num push_num+3
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ mov r5, [esp + push_num*4 + 24]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40]
+ mov r5, [rsp + push_num*8 + 48]
+ %endif
+%endmacro
+
+%macro LOAD_7_PARA 0
+ %ifdef X86_32
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ mov r5, [esp + push_num*4 + 24]
+ mov r6, [esp + push_num*4 + 28]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40]
+ mov r5, [rsp + push_num*8 + 48]
+ mov r6, [rsp + push_num*8 + 56]
+ %elifdef UNIX64
+ mov r6, [rsp + push_num*8 + 8]
+ %endif
+%endmacro
+
+
+
+%macro LOAD_4_PARA_POP 0
+ %ifdef X86_32
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_5_PARA_POP 0
+ %ifdef X86_32
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_6_PARA_POP 0
+ %ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_7_PARA_POP 0
+ %ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro PUSH_XMM 1
+ %ifdef WIN64
+ %assign xmm_num_regs %1
+ %if xmm_num_regs > 6
+ %ifdef push_num
+ %assign push_num push_num+2*(%1-6)
+ %endif
+ sub rsp, 16*(%1 - 6)
+ movdqu [rsp], xmm6
+ %endif
+ %if xmm_num_regs > 7
+ movdqu [rsp+16], xmm7
+ %endif
+ %if xmm_num_regs > 8
+ movdqu [rsp+32], xmm8
+ %endif
+ %if xmm_num_regs > 9
+ movdqu [rsp+48], xmm9
+ %endif
+ %if xmm_num_regs > 10
+ movdqu [rsp+64], xmm10
+ %endif
+ %if xmm_num_regs > 11
+ movdqu [rsp+80], xmm11
+ %endif
+ %if xmm_num_regs > 12
+ movdqu [rsp+96], xmm12
+ %endif
+ %if xmm_num_regs > 13
+ movdqu [rsp+112], xmm13
+ %endif
+ %if xmm_num_regs > 14
+ movdqu [rsp+128], xmm14
+ %endif
+ %if xmm_num_regs > 15
+ movdqu [rsp+144], xmm15
+ %endif
+ %endif
+%endmacro
+
+%macro POP_XMM 0
+ %ifdef WIN64
+ %if xmm_num_regs > 15
+ movdqu xmm15, [rsp+144]
+ %endif
+ %if xmm_num_regs > 14
+ movdqu xmm14, [rsp+128]
+ %endif
+ %if xmm_num_regs > 13
+ movdqu xmm13, [rsp+112]
+ %endif
+ %if xmm_num_regs > 12
+ movdqu xmm12, [rsp+96]
+ %endif
+ %if xmm_num_regs > 11
+ movdqu xmm11, [rsp+80]
+ %endif
+ %if xmm_num_regs > 10
+ movdqu xmm10, [rsp+64]
+ %endif
+ %if xmm_num_regs > 9
+ movdqu xmm9, [rsp+48]
+ %endif
+ %if xmm_num_regs > 8
+ movdqu xmm8, [rsp+32]
+ %endif
+ %if xmm_num_regs > 7
+ movdqu xmm7, [rsp+16]
+ %endif
+ %if xmm_num_regs > 6
+ movdqu xmm6, [rsp]
+ add rsp, 16*(xmm_num_regs - 6)
+ %endif
+ %endif
+%endmacro
+
+%macro SIGN_EXTENSION 2
+ %ifndef X86_32
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro SIGN_EXTENSIONW 2
+ %ifndef X86_32
+ movsx %1, %2
+ %endif
+%endmacro
+
+%macro WELS_EXTERN 1
+ ALIGN 16
+ %ifdef PREFIX
+ global _%1
+ %define %1 _%1
+ %else
+ global %1
+ %endif
+ %1:
+%endmacro
+
+%macro WELS_AbsW 2
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
+%macro MMX_XSwap 4
+ movq %4, %2
+ punpckh%1 %4, %3
+ punpckl%1 %2, %3
+%endmacro
+
+; pOut mm1, mm4, mm5, mm3
+%macro MMX_Trans4x4W 5
+ MMX_XSwap wd, %1, %2, %5
+ MMX_XSwap wd, %3, %4, %2
+ MMX_XSwap dq, %1, %3, %4
+ MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+;for TRANSPOSE
+%macro SSE2_XSawp 4
+ movdqa %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, mm3
+%macro SSE2_Trans4x4D 5
+ SSE2_XSawp dq, %1, %2, %5
+ SSE2_XSawp dq, %3, %4, %2
+ SSE2_XSawp qdq, %1, %3, %4
+ SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+%macro SSE2_TransTwo4x4W 5
+ SSE2_XSawp wd, %1, %2, %5
+ SSE2_XSawp wd, %3, %4, %2
+ SSE2_XSawp dq, %1, %3, %4
+ SSE2_XSawp dq, %5, %2, %3
+ SSE2_XSawp qdq, %1, %5, %2
+ SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+;in: m1, m2, m3, m4, m5, m6, m7, m8
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+%macro SSE2_TransTwo8x8B 9
+ movdqa %9, %8
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %9, %4
+ SSE2_XSawp bw, %7, %6, %4
+
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %9
+ movdqa %9, %3
+ SSE2_XSawp wd, %7, %4, %3
+
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %9
+ movdqa %9, %5
+ SSE2_XSawp dq, %7, %3, %5
+
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %9
+ movdqa %9, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %9
+%endmacro
+
+;xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
+%macro SSE2_LoadDiff8P 5
+ movq %1, %4
+ punpcklbw %1, %3
+ movq %2, %5
+ punpcklbw %2, %3
+ psubw %1, %2
+%endmacro
+
+; m2 = m1 + m2, m1 = m1 - m2
+%macro SSE2_SumSub 3
+ movdqa %3, %2
+ paddw %2, %1
+ psubw %1, %3
+%endmacro
+
+
+%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+ mov %3h, %3l
+ movd %1, e%3x ; i.e, 1% = eax (=b0)
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
+
+;copy a dw into a xmm for 8 times
+%macro SSE2_Copy8Times 2
+ movd %1, %2
+ punpcklwd %1, %1
+ pshufd %1, %1, 0
+%endmacro
+
+;copy a db into a xmm for 16 times
+%macro SSE2_Copy16Times 2
+ movd %1, %2
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+ packuswb %1, %1
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+ pcmpeqw %1,%1
+ psrlw %1,15
+ psllw %1,5
+%endmacro
+
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+ pcmpeqw %1,%1
+ psrlw %1,15
+%endmacro
+
+;all 0 for xmm and mm
+%macro WELS_Zero 1
+ pxor %1, %1
+%endmacro
+
+;dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+ pcmpeqw %1,%1
+ psrld %1,31
+%endmacro
+
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+ pcmpeqw %1,%1
+ psrlw %1,15
+ packuswb %1,%1
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/common/x86/cpuid.asm
@@ -1,0 +1,212 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* cpu_mmx.asm
+;*
+;* Abstract
+;* verify cpuid feature support and cpuid detection
+;*
+;* History
+;* 04/29/2009 Created
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+;******************************************************************************************
+; int32_t WelsCPUIdVerify()
+;******************************************************************************************
+WELS_EXTERN WelsCPUIdVerify
+ push r1
+ PUSHRFLAGS
+ PUSHRFLAGS
+
+ pop r1
+ mov eax, r1d
+ xor eax, 00200000h
+ xor eax, r1d
+ POPRFLAGS
+ pop r1
+ ret
+
+;****************************************************************************************************
+; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+%ifdef WIN64
+
+WELS_EXTERN WelsCPUId
+ push rbx
+ push rdx
+
+ mov eax, ecx
+ mov rcx, [r9]
+ cpuid
+ mov [r9], ecx
+ mov [r8], ebx
+ mov rcx, [rsp + 2*8 + 40]
+ mov [rcx], edx
+ pop rdx
+ mov [rdx], eax
+
+ pop rbx
+ ret
+
+%elifdef UNIX64
+WELS_EXTERN WelsCPUId
+ push rbx
+ push rcx
+ push rdx
+
+ mov eax, edi
+ mov rcx, [rcx]
+ cpuid
+ mov [r8], edx
+ pop rdx
+ pop r8
+ mov [r8], ecx
+ mov [rdx], ebx
+ mov [rsi], eax
+
+ pop rbx
+ ret
+
+%elifdef X86_32
+
+WELS_EXTERN WelsCPUId
+ push ebx
+ push edi
+
+ mov eax, [esp+12] ; operating index
+ mov edi, [esp+24]
+ mov ecx, [edi]
+ cpuid ; cpuid
+
+ ; processing various information return
+ mov edi, [esp+16]
+ mov [edi], eax
+ mov edi, [esp+20]
+ mov [edi], ebx
+ mov edi, [esp+24]
+ mov [edi], ecx
+ mov edi, [esp+28]
+ mov [edi], edx
+
+ pop edi
+ pop ebx
+ ret
+
+%endif
+
+; need call after cpuid=1 and eax, ecx flag got then
+;****************************************************************************************************
+; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WELS_EXTERN WelsCPUSupportAVX
+%ifdef WIN64
+ mov eax, ecx
+ mov ecx, edx
+%elifdef UNIX64
+ mov eax, edi
+ mov ecx, esi
+%else
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
+%endif
+
+ ; refer to detection of AVX addressed in INTEL AVX manual document
+ and ecx, 018000000H
+ cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
+ jne avx_not_supported
+ ; processor supports AVX instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne avx_not_supported
+ mov eax, 1
+ ret
+avx_not_supported:
+ mov eax, 0
+ ret
+
+
+; need call after cpuid=1 and eax, ecx flag got then
+;****************************************************************************************************
+; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WELS_EXTERN WelsCPUSupportFMA
+%ifdef WIN64
+ mov eax, ecx
+ mov ecx, edx
+%elifdef UNIX64
+ mov eax, edi
+ mov ecx, esi
+%else
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
+%endif
+ ; refer to detection of FMA addressed in INTEL AVX manual document
+ and ecx, 018001000H
+ cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
+ jne fma_not_supported
+ ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne fma_not_supported
+ mov eax, 1
+ ret
+fma_not_supported:
+ mov eax, 0
+ ret
+
+;******************************************************************************************
+; void WelsEmms()
+;******************************************************************************************
+WELS_EXTERN WelsEmms
+ emms ; empty mmx technology states
+ ret
+
--- /dev/null
+++ b/codec/common/x86/deblock.asm
@@ -1,0 +1,5278 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+SECTION .rodata align=16
+
+ALIGN 16
+FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4
+
+
+SECTION .text
+
+%ifdef WIN64
+
+
+WELS_EXTERN DeblockLumaLt4V_ssse3
+ push rbp
+ mov r11,[rsp + 16 + 20h] ; pTC
+ PUSH_XMM 16
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12
+ mov r10,rcx
+ movsxd r12,edx
+ add edx,edx
+ movsxd rdx,edx
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8
+ movdqa [r12+rcx],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ POP_XMM
+ pop rbp
+ ret
+
+
+WELS_EXTERN DeblockLumaEq4V_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaLt4V_ssse3
+ mov rax,rsp
+ push rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,0C8h
+ mov r10,qword [rax + 30h] ; pTC
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ POP_XMM
+ pop rdi
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaEq4V_ssse3
+ mov rax,rsp
+ push rbx
+ PUSH_XMM 15
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ POP_XMM
+ pop rbx
+ ret
+
+
+
+
+
+WELS_EXTERN DeblockChromaEq4H_ssse3
+ mov rax,rsp
+ mov [rax+20h],rbx
+ push rdi
+ PUSH_XMM 16
+ sub rsp,140h
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h + 160] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea rsp,[rsp+140h]
+ POP_XMM
+ mov rbx, [rsp+28h]
+ pop rdi
+ ret
+
+
+
+WELS_EXTERN DeblockChromaLt4H_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ PUSH_XMM 16
+ sub rsp,170h
+
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, [rsp+1C8h+160] ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ movsx eax,word [rsp+1C0h+160] ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ POP_XMM
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
+
+
+
+%elifdef UNIX64
+
+
+WELS_EXTERN DeblockLumaLt4V_ssse3
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12
+ mov r10,rdi
+ movsxd r12,esi
+ add rsi,rsi
+ movsxd rdx,esi
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,esi
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rdi]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8
+ movdqa [r12+rdi],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
+
+
+WELS_EXTERN DeblockLumaEq4V_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
+
+WELS_EXTERN DeblockChromaLt4V_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ mov r10, rdx
+ mov r11, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rsi, r10
+ mov r10, r9
+ mov rbp, r8
+ mov r8, rsi
+ mov r9, r11
+ sub rsp,0C8h
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
+
+WELS_EXTERN DeblockChromaEq4V_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ ;movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movq [rbx],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
+
+WELS_EXTERN DeblockChromaEq4H_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
+
+ mov rbp, r8
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ mov rdi, rdx
+
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
+ mov rsp,r11
+ pop r12
+ pop rbp
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaLt4H_ssse3
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ sub rsp,170h
+
+ mov r13, r8
+ mov r14, r9
+ mov r8, rdx
+ mov r9, rcx
+ mov rdx, rdi
+ mov rcx, rsi
+
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, r14 ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ mov eax, r13d ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ ret
+
+
+
+%elifdef X86_32
+
+;********************************************************************************
+; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_ssse3
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4V_ssse3
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN DeblockChromaEq4H_ssse3
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4H_ssse3
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+
+
+WELS_EXTERN DeblockLumaLt4V_ssse3
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
+
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
+
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
+
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
+
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
+
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
+
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
+
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
+
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
+
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+ mov ecx, dword [esp+432-408]
+
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
+
+
+WELS_EXTERN DeblockLumaEq4V_ssse3
+
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
+
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
+
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
+
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
+
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
+
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
+
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
+
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+%endif
+
+
+
+;********************************************************************************
+;
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+ push r3
+ push r4
+ push r5
+
+%assign push_num 3
+ LOAD_3_PARA
+ PUSH_XMM 8
+
+ SIGN_EXTENSION r1, r1d
+
+ mov r5, r7
+ mov r3, r7
+ and r3, 0Fh
+ sub r7, r3
+ sub r7, 10h
+
+ lea r3, [r0 + r1 * 8]
+ lea r4, [r1 * 3]
+
+ movq xmm0, [r0]
+ movq xmm7, [r3]
+ punpcklqdq xmm0, xmm7
+ movq xmm1, [r0 + r1]
+ movq xmm7, [r3 + r1]
+ punpcklqdq xmm1, xmm7
+ movq xmm2, [r0 + r1*2]
+ movq xmm7, [r3 + r1*2]
+ punpcklqdq xmm2, xmm7
+ movq xmm3, [r0 + r4]
+ movq xmm7, [r3 + r4]
+ punpcklqdq xmm3, xmm7
+
+ lea r0, [r0 + r1 * 4]
+ lea r3, [r3 + r1 * 4]
+ movq xmm4, [r0]
+ movq xmm7, [r3]
+ punpcklqdq xmm4, xmm7
+ movq xmm5, [r0 + r1]
+ movq xmm7, [r3 + r1]
+ punpcklqdq xmm5, xmm7
+ movq xmm6, [r0 + r1*2]
+ movq xmm7, [r3 + r1*2]
+ punpcklqdq xmm6, xmm7
+
+ movdqa [r7], xmm0
+ movq xmm7, [r0 + r4]
+ movq xmm0, [r3 + r4]
+ punpcklqdq xmm7, xmm0
+ movdqa xmm0, [r7]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ movdqa [r2], xmm4
+ movdqa [r2 + 10h], xmm2
+ movdqa [r2 + 20h], xmm3
+ movdqa [r2 + 30h], xmm7
+ movdqa [r2 + 40h], xmm5
+ movdqa [r2 + 50h], xmm1
+ movdqa [r2 + 60h], xmm6
+ movdqa [r2 + 70h], xmm0
+
+ mov r7, r5
+ POP_XMM
+ pop r5
+ pop r4
+ pop r3
+ ret
+
+
+;*******************************************************************************************
+;
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+ push r3
+ push r4
+
+%assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+
+ SIGN_EXTENSION r1, r1d
+
+ mov r4, r7
+ mov r3, r7
+ and r3, 0Fh
+ sub r7, r3
+ sub r7, 10h
+
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2 + 10h]
+ movdqa xmm2, [r2 + 20h]
+ movdqa xmm3, [r2 + 30h]
+ movdqa xmm4, [r2 + 40h]
+ movdqa xmm5, [r2 + 50h]
+ movdqa xmm6, [r2 + 60h]
+ movdqa xmm7, [r2 + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ lea r2, [r1 * 3]
+
+ movq [r0], xmm4
+ movq [r0 + r1], xmm2
+ movq [r0 + r1*2], xmm3
+ movq [r0 + r2], xmm7
+
+ lea r0, [r0 + r1*4]
+ movq [r0], xmm5
+ movq [r0 + r1], xmm1
+ movq [r0 + r1*2], xmm6
+ movq [r0 + r2], xmm0
+
+ psrldq xmm4, 8
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea r0, [r0 + r1*4]
+ movq [r0], xmm4
+ movq [r0 + r1], xmm2
+ movq [r0 + r1*2], xmm3
+ movq [r0 + r2], xmm7
+
+ lea r0, [r0 + r1*4]
+ movq [r0], xmm5
+ movq [r0 + r1], xmm1
+ movq [r0 + r1*2], xmm6
+ movq [r0 + r2], xmm0
+
+
+ mov r7, r4
+ POP_XMM
+ pop r4
+ pop r3
+ ret
+
--- /dev/null
+++ b/codec/common/x86/expand_picture.asm
@@ -1,0 +1,728 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* expand_picture.asm
+;*
+;* Abstract
+;* mmxext/sse for expand_frame
+;*
+;* History
+;* 09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+2*%2]
+ movq [%1], %3
+ movq [%1+%2], %3
+ lea %1, [%1+%2]
+%endmacro
+
+%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
+ movdq%4 [%1], %3 ; top(bottom)_0
+ movdq%4 [%1+%2], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; top(bottom)_2
+ movdq%4 [%1+%2], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
+%endmacro
+
+%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
+ movdqa [%1], %3 ; top(bottom)_0
+ movdqa [%1+16], %3 ; top(bottom)_0
+ movdqa [%1+%2], %3 ; top(bottom)_1
+ movdqa [%1+%2+16], %3 ; top(bottom)_1
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; top(bottom)_2
+ movdqa [%1+16], %3 ; top(bottom)_2
+ movdqa [%1+%2], %3 ; top(bottom)_3
+ movdqa [%1+%2+16], %3 ; top(bottom)_3
+ lea %1, [%1+%2]
+%endmacro
+
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+ ;r2 [width/16(8)]
+ ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
+ ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
+
+%if %1 == 32 ; for luma
+ sar r2, 04h ; width / 16(8) pixels
+.top_bottom_loops:
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
+
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
+
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride need for next loop?
+
+ dec r2
+ jnz near .top_bottom_loops
+%elif %1 == 16 ; for chroma ??
+ mov r6, r2
+ sar r2, 04h ; (width / 16) pixels
+.top_bottom_loops:
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a
+
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a
+
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; positive/negative stride need for next loop?
+
+ dec r2
+ jnz near .top_bottom_loops
+
+ ; for remaining 8 bytes
+ and r6, 0fh ; any 8 bytes left?
+ test r6, r6
+ jz near .to_be_continued ; no left to exit here
+
+ ; top
+ movq mm0, [r0] ; remained 8 byte
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+ ; bottom
+ movq mm1, [r3]
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+ WELSEMMS
+
+.to_be_continued:
+%endif
+%endmacro
+
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+ ;r6 [height]
+ ;r0 [pSrc+0] r5[pSrc-32] r1[stride]
+ ;r3 [pSrc+(w-1)] r4[pSrc+w]
+
+%if %1 == 32 ; for luma
+.left_right_loops:
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
+ movdqa [r5+16], xmm0
+
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r4], xmm1
+ movdqa [r4+16], xmm1
+
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
+
+ dec r6
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
+.left_right_loops:
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdqa [r5], xmm0
+
+ ; right
+ movzx r2d, byte [r3]
+ SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
+ movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
+
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
+
+ dec r6
+ jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+ ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+ ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+ ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
+%if %1 == 32 ; luma
+ ; TL
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
+
+ ; TR
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
+
+ ; BL
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
+
+ ; BR
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+ mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
+%elif %1 == 16 ; chroma
+ ; TL
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
+
+ ; TR
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
+
+ ; BL
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
+
+ ; BR
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+ mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
+%endif
+%endmacro
+
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureLuma_sse2
+
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 7
+
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r2, r2d
+ SIGN_EXTENSION r3, r3d
+
+ ;also prepare for cross border pData top-left:xmm3
+
+ movzx r6d,byte[r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
+ neg r1
+
+ push r3
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,05h ;r6 = 32*stride
+ lea r4,[r3+r6] ;r4 = dst bottom
+
+ ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 32
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-32] ;left border dst luma =32 chroma = -16
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare for cross border data: top-rigth with xmm4
+ movzx r6d,byte [r3] ;top -rigth
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0
+ push r1
+ push r2
+ push r6
+
+ exp_left_right_sse2 32,a
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-32]
+ lea r3,[r3+r1] ;last line of top-left border
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,32 ;height +32(16) ,luma = 32, chroma = 16
+ imul r6,r1
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of botoom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; for left & right border expanding
+ exp_cross_sse2 32,a
+
+ POP_XMM
+ LOAD_4_PARA_POP
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
+
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureChromaAlign_sse2
+
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 7
+
+ SIGN_EXTENSION r1,r1d
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+
+ ;also prepare for cross border pData top-left:xmm3
+
+ movzx r6d,byte [r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
+ neg r1
+
+ push r3
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,04h ;r6 = 32*stride
+ lea r4,[r3+r6] ;r4 = dst bottom
+
+ ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 16
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-16] ;left border dst luma =32 chroma = -16
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare for cross border data: top-rigth with xmm4
+ movzx r6d,byte [r3] ;top -rigth
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0
+ push r1
+ push r2
+ push r6
+ exp_left_right_sse2 16,a
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-16]
+ lea r3,[r3+r1] ;last line of top-left border
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,16 ;height +32(16) ,luma = 32, chroma = 16
+ imul r6,r1
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of botoom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; for left & right border expanding
+ exp_cross_sse2 16,a
+
+ POP_XMM
+ LOAD_4_PARA_POP
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
+
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureChromaUnalign_sse2
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 7
+
+ SIGN_EXTENSION r1,r1d
+ SIGN_EXTENSION r2,r2d
+ SIGN_EXTENSION r3,r3d
+
+ ;also prepare for cross border pData top-left:xmm3
+
+ movzx r6d,byte [r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;last line of top border r5= dst top pSrc[-stride]
+ neg r1
+
+ push r3
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,04h ;r6 = 32*stride
+ lea r4,[r3+r6] ;r4 = dst bottom
+
+ ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 16
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-16] ;left border dst luma =32 chroma = -16
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare for cross border data: top-rigth with xmm4
+ movzx r6d,byte [r3] ;top -rigth
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0
+ push r1
+ push r2
+ push r6
+ exp_left_right_sse2 16,u
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-16]
+ lea r3,[r3+r1] ;last line of top-left border
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,16 ;height +32(16) ,luma = 32, chroma = 16
+ imul r6,r1
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of botoom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; for left & right border expanding
+ exp_cross_sse2 16,u
+
+ POP_XMM
+ LOAD_4_PARA_POP
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
--- /dev/null
+++ b/codec/common/x86/mb_copy.asm
@@ -1,0 +1,581 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mb_copy.asm
+;*
+;* Abstract
+;* mb_copy and mb_copy1
+;*
+;* History
+;* 15/09/2009 Created
+;* 12/28/2009 Modified with larger throughput
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+
+;***********************************************************************
+; void WelsCopy16x16_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy16x16_sse2
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+; , 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy16x8NotAligned_sse2
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy8x16_mmx
+ %assign push_num 0
+ LOAD_4_PARA
+
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+ lea r2, [r2+2*r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+ lea r0, [r0+2*r1]
+
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy8x8_mmx
+ push r4
+ %assign push_num 1
+ LOAD_4_PARA
+ lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
+
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ pop r4
+ ret
+
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+WELS_EXTERN UpdateMbMv_sse2
+
+ %assign push_num 0
+ LOAD_2_PARA
+
+ movd xmm0, r1d ; _mv
+ pshufd xmm1, xmm0, $00
+ movdqa [r0 ], xmm1
+ movdqa [r0+0x10], xmm1
+ movdqa [r0+0x20], xmm1
+ movdqa [r0+0x30], xmm1
+ ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq4_mmx
+
+ %assign push_num 0
+ LOAD_7_PARA
+
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
+
+ALIGN 4
+.height_loop:
+ movd mm0, [r4]
+ pavgb mm0, [r2]
+ movd [r0], mm0
+
+ dec r6
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ lea r4, [r4+r5]
+ jne .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq8_mmx
+ %assign push_num 0
+ LOAD_7_PARA
+
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r2]
+ pavgb mm0, [r4]
+ movq [r0], mm0
+ movq mm0, [r2+r3]
+ pavgb mm0, [r4+r5]
+ movq [r0+r1], mm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 2
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+ %assign push_num 0
+ LOAD_7_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+ SIGN_EXTENSION r6, r6d
+ALIGN 4
+.height_loop:
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r4]
+ pavgb xmm0, xmm1
+ ;pavgb xmm0, [r4]
+ movdqu [r0], xmm0
+
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ movdqu xmm0, [r2+2*r3]
+ movdqu xmm1, [r4+2*r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+2*r1], xmm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 4
+ jne .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+;*******************************************************************************
+; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq4_mmx
+ push r5
+ %assign push_num 1
+ LOAD_5_PARA
+
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+
+ALIGN 4
+.height_loop:
+ mov r5d, [r0]
+ mov [r2], r5d
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+ WELSEMMS
+ LOAD_5_PARA_POP
+ pop r5
+ ret
+
+;*******************************************************************************
+; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq8_mmx
+ %assign push_num 0
+ LOAD_5_PARA
+
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r0]
+ movq [r2], mm0
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;read unaligned memory
+%macro SSE_READ_UNA 2
+ movq %1, [%2]
+ movhps %1, [%2+8]
+%endmacro
+
+;write unaligned memory
+%macro SSE_WRITE_UNA 2
+ movq [%1], %2
+ movhps [%1+8], %2
+%endmacro
+WELS_EXTERN McCopyWidthEq16_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ALIGN 4
+.height_loop:
+ SSE_READ_UNA xmm0, r0
+ SSE_READ_UNA xmm1, r0+r1
+ SSE_WRITE_UNA r2, xmm0
+ SSE_WRITE_UNA r2+r3, xmm1
+
+ sub r4, 2
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ jnz .height_loop
+
+ LOAD_5_PARA_POP
+ ret
--- /dev/null
+++ b/codec/common/x86/mc_chroma.asm
@@ -1,0 +1,293 @@
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( const uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+ %assign push_num 0
+ LOAD_6_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+
+ movd mm3, [r4]; [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movd mm0, [r0]
+ movd mm1, [r0+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [r4]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [r4+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [r2], mm0
+
+ movq mm0, mm2
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+ WELSEMMS
+ LOAD_6_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+
+ movd xmm3, [r4]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movq xmm0, [r0]
+ movq xmm1, [r0+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+
+ POP_XMM
+ LOAD_6_PARA_POP
+
+ ret
+
+
+
+
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; const uint8_t *pABCD,
+; int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+
+ pxor xmm7, xmm7
+ movd xmm5, [r4]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ sub r2, r3 ;sub esi, edi
+ sub r2, r3
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [r0]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea r2, [r2+2*r3]
+
+ movdqu xmm2, [r0+r1]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [r2],xmm0
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [r2+r3],xmm4
+
+ sub r5, 2
+ jnz .hloop_chroma
+
+ POP_XMM
+ LOAD_6_PARA_POP
+
+ ret
+
+
--- /dev/null
+++ b/codec/common/x86/mc_luma.asm
@@ -1,0 +1,1164 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_luma.asm
+;*
+;* Abstract
+;* sse2 motion compensation
+;*
+;* History
+;* 17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+SECTION .rodata align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:
+ dw 16, 16, 16, 16
+ALIGN 16
+h264_w0x10_1:
+ dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+ dw 32, 32, 32, 32, 32, 32, 32, 32
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+
+;*******************************************************************************
+; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight)
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq4_mmx
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+
+ sub r0, 2
+ WELS_Zero mm7
+ movq mm6, [h264_w0x10]
+.height_loop:
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movd mm1, [r0+5]
+ punpcklbw mm1, mm7
+ movd mm2, [r0+1]
+ punpcklbw mm2, mm7
+ movd mm3, [r0+4]
+ punpcklbw mm3, mm7
+ movd mm4, [r0+2]
+ punpcklbw mm4, mm7
+ movd mm5, [r0+3]
+ punpcklbw mm5, mm7
+
+ paddw mm2, mm3
+ paddw mm4, mm5
+ psllw mm4, 2
+ psubw mm4, mm2
+ paddw mm0, mm1
+ paddw mm0, mm4
+ psllw mm4, 2
+ paddw mm0, mm4
+ paddw mm0, mm6
+ psraw mm0, 5
+ packuswb mm0, mm7
+ movd [r2], mm0
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+
+%macro SSE_LOAD_8P 3
+ movq %1, %3
+ punpcklbw %1, %2
+%endmacro
+
+%macro FILTER_HV_W8 9
+ paddw %1, %6
+ movdqa %8, %3
+ movdqa %7, %2
+ paddw %1, [h264_w0x10_1]
+ paddw %8, %4
+ paddw %7, %5
+ psllw %8, 2
+ psubw %8, %7
+ paddw %1, %8
+ psllw %8, 2
+ paddw %1, %8
+ psraw %1, 5
+ WELS_Zero %8
+ packuswb %1, %8
+ movq %9, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
+; int16_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride
+; int32_t iHeight
+; )
+;***********************************************************************
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ pxor xmm7, xmm7
+
+ sub r0, r1 ;;;;;;;;need more 5 lines.
+ sub r0, r1
+
+.yloop_width_8:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .yloop_width_8
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;*******************************************************************************
+; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
+; );
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq8_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;*******************************************************************************
+; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight,
+; );
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq16_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2+8], xmm0
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight )
+;*******************************************************************************
+WELS_EXTERN McHorVer02WidthEq8_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r0, r1
+ sub r0, r1
+
+ WELS_Zero xmm7
+
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+.start:
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
+
+.xx_exit:
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight )
+;***********************************************************************
+WELS_EXTERN McHorVer02Height9Or17_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3
+ sub r0, r1
+ sub r0, r1
+
+.xloop:
+ WELS_Zero xmm7
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm2
+ movdqa xmm2,xmm3
+ movdqa xmm3,xmm4
+ movdqa xmm4,xmm5
+ movdqa xmm5,xmm6
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .xx_exit
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
+ sub r0, r1
+ sub r0, r1
+ add r0, 8
+ add r2, 8
+ jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
+; );
+;***********************************************************************
+WELS_EXTERN McHorVer20Width9Or17_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ sub r0, 2
+ pxor xmm7, xmm7
+
+ cmp r4, 9
+ jne near .width_17
+
+.yloop_width_9:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2], xmm0
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+1], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movq [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2+8], xmm0
+
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+9], xmm2
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+; (const uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t * pTap,
+; int32_t iTapStride,
+; int32_t iWidth,int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McHorVer22HorFirst_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+ pxor xmm7, xmm7
+ sub r0, r1 ;;;;;;;;need more 5 lines.
+ sub r0, r1
+
+ cmp r4, 9
+ jne near .width_17
+
+.yloop_width_9:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2], xmm0
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+2], xmm2
+ movhps [r2+2+8], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2+16], xmm0
+
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+18], xmm2
+ movhps [r2+18+8], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+
+%macro FILTER_VER 9
+ paddw %1, %6
+ movdqa %7, %2
+ movdqa %8, %3
+
+
+ paddw %7, %5
+ paddw %8, %4
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
+ paddw %8, [h264_mc_hc_32]
+ psraw %8, 6
+ packuswb %8, %8
+ movq %9, %8
+%endmacro
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3
+
+.width_loop:
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm5, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .exit
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+; const uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ SIGN_EXTENSION r5, r5d
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+ shr r4, 3
+
+.width_loop:
+ movdqu xmm0, [r0]
+ movdqu xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqu xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ movdqu xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm5, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .exit
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
--- /dev/null
+++ b/codec/common/x86/satd_sad.asm
@@ -1,0 +1,2184 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* satd_sad.asm
+;*
+;* Abstract
+;* WelsSampleSatd4x4_sse2
+;* WelsSampleSatd8x8_sse2
+;* WelsSampleSatd16x8_sse2
+;* WelsSampleSatd8x16_sse2
+;* WelsSampleSatd16x16_sse2
+;*
+;* WelsSampleSad16x8_sse2
+;* WelsSampleSad16x16_sse2
+;*
+;* History
+;* 8/5/2009 Created
+;* 24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1: dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2: dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubw %1, %2
+%endmacro
+
+%macro SSE2_SumWHorizon1 2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
+%endmacro
+
+%macro SSE2_SumAbs4 7
+ WELS_AbsW %1, %3
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
+%endmacro
+
+%macro SSE2_SumWHorizon 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0 , [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ movd xmm4, [r2]
+ movd xmm5, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm4, xmm6
+ punpckldq xmm5, xmm7
+
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+ punpcklbw xmm4, xmm6
+ punpcklbw xmm5, xmm6
+
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ WELS_AbsW xmm0, xmm3
+ paddusw xmm6, xmm0
+ WELS_AbsW xmm2, xmm4
+ paddusw xmm6, xmm2
+ SSE2_SumWHorizon1 xmm6, xmm4
+ movd retrd, xmm6
+ and retrd, 0xffff
+ shr retrd, 1
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ SSE2_GetSatd8x8
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+; paddd xmm4, %1 ;for dc
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0
+ pxor xmm7, xmm7
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2], 0
+ pinsrw xmm0, word[esi+%2+8], 4
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+2], 0
+ pinsrw xmm0, word[esi+%2+10], 4
+ psubsw xmm0, xmm1
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+4], 0
+ pinsrw xmm0, word[esi+%2+12], 4
+ psubsw xmm0, xmm3
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+6], 0
+ pinsrw xmm0, word[esi+%2+14], 4
+ psubsw xmm0, xmm2
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH 3
+ movq xmm0, [esi+%3+8*%1]
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+ paddw xmm2, xmm1;for DC
+ paddw xmm2, xmm3;for DC
+ paddw xmm5, xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0
+ pxor xmm0, xmm0
+ movq2dq xmm0, mm4
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1
+ shl %1, 4
+ movdqa xmm0, [esi+32+%1]
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 32
+ SSE41_I16X16GetX38x4SatdDC
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 16
+ SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+%macro SSE41_HSum8W 3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx
+ movdqu xmm0, [ecx]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi], xmm0 ;V
+ movdqa [esi+16], xmm1
+ add ecx, edx
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 8
+ pinsrb xmm0, byte[ecx+edx-1], 9
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 10
+ pinsrb xmm0, byte[ecx+edx-1], 11
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 12
+ pinsrb xmm0, byte[ecx+edx-1], 13
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 14
+ pinsrb xmm0, byte[ecx+edx-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi+32], xmm0 ;H
+ movdqa [esi+48], xmm1
+ movd ecx, xmm4 ;dc
+ add ecx, 16 ;(sum+16)
+ shr ecx, 5 ;((sum+16)>>5)
+ shl ecx, 4 ;
+ movd mm4, ecx ; mm4 copy DC
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+ mov edi, 0
+.loop16x16_get_satd:
+.loopStart1:
+ SSE41_I16x16GetX38x4Satd ecx, edi
+ inc ecx
+ cmp ecx, 4
+ jl .loopStart1
+ cmp edi, 16
+ je .loop16x16_get_satd_end
+ mov eax, [esp+24]
+ add eax, 8
+ mov ecx, 0
+ add edi, 16
+ jmp .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ebx, edx
+ mov edx, [esp+32]
+ cmp ebx, edi
+ jge near not_dc_16x16
+ cmp ebx, ecx
+ jge near not_dc_h_16x16
+
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_16x16
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+return_satd_intra_16x16_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+%macro SSE41_ChromaGetX38x8Satd 0
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx
+ movq xmm0, [ecx]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [esi], xmm0 ;V
+ add ecx, edx
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [esi+16], xmm0 ;H
+;(sum+2)>>2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [esi+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [esi+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+loop_chroma_satdx3_cb_cr:
+ SSE41_ChromaGetX38x4Satd ecx, 0
+ inc ecx
+ cmp ecx, 2
+ jl loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
+%endmacro
+%macro MMXReg2SSE 4
+ movq2dq %1, %3
+ movq2dq %2, %4
+ punpcklqdq %1, %2
+%endmacro
+;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ xor edi, edi
+loop_chroma_satdx3:
+ SSE41_ChromaGetX38x8Satd
+ cmp edi, 1
+ je loop_chroma_satdx3end
+ inc edi
+ SSEReg2MMX xmm4, mm0,mm1
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov ecx, [esp+44]
+ mov eax, [esp+48]
+ jmp loop_chroma_satdx3
+loop_chroma_satdx3end:
+ MMXReg2SSE xmm0, xmm3, mm0, mm1
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
+
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ecx, edx
+ mov edx, [esp+32]
+ cmp ebx, edi
+ jge near not_dc_8x8
+ cmp ebx, ecx
+ jge near not_dc_h_8x8
+
+ ; for DC mode
+ mov dword[edx], 0;I8_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_8x8
+ mov dword[edx], 1;I8_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+ ; for V mode
+ mov dword[edx], 2;I8_PRED_V
+ mov eax, ecx
+return_satd_intra_8x8_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2
+ movd xmm6,%1
+ pshufb xmm6,xmm1
+ movdqa %1, xmm6
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
+ psadbw xmm6,%2
+ paddw xmm3,xmm6
+%endmacro
+%macro WelsAddDCValue 4
+ movzx %2, byte %1
+ mov %3, %2
+ add %4, %2
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov edi, [esp+40] ;temp_sad
+ sub ecx, edx
+ movdqa xmm5,[ecx]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd eax,xmm0
+
+ add ecx,edx
+ lea ebx, [edx+2*edx]
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ sub edi, 192
+ add eax,10h
+ shr eax,5
+ movd xmm7,eax
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+;sad begin
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ lea esi, [ebx+2*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+; comparing order: DC H V
+ movd ebx, xmm4 ;DC
+ movd ecx, xmm3 ;V
+ psrldq xmm3, 4
+ movd esi, xmm3 ;H
+ mov eax, [esp+36] ;lamda
+ shl eax, 1
+ add esi, eax
+ add ebx, eax
+ mov edx, [esp+32]
+ cmp ebx, esi
+ jge near not_dc_16x16_sad
+ cmp ebx, ecx
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm7
+%assign x x+1
+%endrep
+ jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+ ; for H mode
+ cmp esi, ecx
+ jge near not_dc_h_16x16_sad
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, esi
+ jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+%macro SSE41_GetSatd8x4 0
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1]
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2
+ psubsw xmm1, xmm3
+ movq xmm2, [r0+2*r1]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3]
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5]
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4
+ psubsw xmm3, xmm5
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA
+ pslld xmm1, 16
+ psrld xmm4, 16
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
+%endmacro
+
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[r2]
+ movd xmm5,[r2+r3]
+ shufps xmm2,xmm5,0
+ movd xmm3,[r2+r3*2]
+ lea r2, [r3*2+r2]
+ movd xmm5,[r2+r3]
+ shufps xmm3,xmm5,0
+ movd xmm0,[r0]
+ movd xmm5,[r0+r1]
+ shufps xmm0,xmm5,0
+ movd xmm1,[r0+r1*2]
+ lea r0, [r1*2+r0]
+ movd xmm5,[r0+r1]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
+loop_get_satd_8x16:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_8x16
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ push r0
+ push r2
+
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ push r0
+ push r2
+
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
+loop_get_satd_16x16_left:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_left
+
+ pop r2
+ pop r0
+ add r0, 8
+ add r2, 8
+ mov r6, 0
+loop_get_satd_16x16_right:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_right
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2]
+ MOVDQ xmm2, [r0];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ paddw xmm7, xmm0
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3]
+ MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5]
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
+
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+
+ pxor xmm7, xmm7
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and %1, 0x1f|(%3>>1)
+cmp %1, (32-%2)|(%3>>1)
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+ %assign push_num 0
+ mov r2, arg3
+ push r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+ pop r2
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+%endif
+ %assign push_num 3
+ PUSH_XMM 8
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
+ pxor xmm7, xmm7
+
+ ;ecx r2, edx r4, edi r5
+
+ mov r5, r2
+ and r5, 0x07
+ sub r2, r5
+ mov r4, 8
+ sub r4, r5
+
+ shl r5, 3
+ shl r4, 3
+ movd xmm5, r5d
+ movd xmm6, r4d
+ mov r5, 8
+ add r5, r2
+ mov r3, arg4
+ SIGN_EXTENSION r3, r3d
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ POP_XMM
+%ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+%endif
+ jmp .return
+
+.pixel_sad_8x8_nsplit:
+
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 7
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm6, xmm6
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ POP_XMM
+ LOAD_4_PARA_POP
+.return:
+ ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
+ psadbw %1, %4
+ paddw xmm5, %1
+ psadbw %4, %3
+ paddw xmm4, %4
+ movdqu %4, [%5-1]
+ psadbw %4, %2
+ paddw xmm6, %4
+ movdqu %4, [%5+1]
+ psadbw %4, %2
+ paddw xmm7, %4
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3
+ paddw xmm5, xmm2
+
+ movdqu xmm2, [r2-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movdqu xmm0, [r2-1]
+ psadbw xmm0, xmm1
+ paddw xmm6, xmm0
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm1
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm1, xmm3
+ paddw xmm5, xmm1
+
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub r2, r3
+ movd xmm1, [r2]
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2
+ movd xmm2, [r2+r3-1]
+ movd xmm3, [r2+r3+1]
+
+ lea r2, [r2+2*r3]
+
+ movd xmm4, [r2]
+ movd xmm5, [r2-1]
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1]
+ punpckldq xmm3, xmm5
+
+ movd xmm5, [r2+r3]
+ punpckldq xmm4, xmm5
+
+ punpcklqdq xmm1, xmm4 ;-L
+
+ movd xmm5, [r2+r3-1]
+ movd xmm6, [r2+r3+1]
+
+ lea r2, [r2+2*r3]
+ movd xmm7, [r2-1]
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ;-1
+ movd xmm7, [r2+1]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ;+1
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ;+L
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ punpckldq xmm1, xmm4
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm1, xmm2
+ movdqa [r4],xmm1
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WELS_EXTERN WelsSampleSad4x4_mmx
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movd mm0, [r0]
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm0, mm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ movd mm1, [r0]
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm1, mm3
+ paddw mm0, mm1
+
+ movd retrd, mm0
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
--- /dev/null
+++ b/codec/common/x86/vaa.asm
@@ -1,0 +1,411 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+; by comparing it outperforms than phaddw(SSSE3) sets
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
+
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h
+ pshufhw %6, %3, 0B1h
+ paddw %1, %5
+ paddw %3, %6
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5
+ paddw %4, %6
+ punpcklwd %1, %2
+ punpckhwd %3, %4
+ punpcklwd %1, %3
+ psraw %1, $04
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4
+ paddw %2, %3
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $04
+%endmacro
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; , 6/7/2010
+
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1,r1d
+
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4
+%endif
+
+ mov r5,r7
+ and r5,0fh
+ sub r7,r5
+ sub r7,32
+
+
+ mov r2,r1
+ sal r2,$01 ;r2 = 2*iLineSize
+ mov r3,r2
+ add r3,r1 ;r3 = 3*iLineSize
+
+ mov r4,r2
+ sal r4,$01 ;r4 = 4*iLineSize
+
+ pxor xmm7, xmm7
+
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7], xmm0
+
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+8], xmm0
+
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16], xmm0
+
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+24], xmm0
+
+ movdqa xmm0, [r7] ; block 0~7
+ movdqa xmm1, [r7+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3
+
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+
+
+
+ movd r2d, xmm0
+ and r2, 0ffffh ; effective low work truncated
+ mov r3, r2
+ imul r2, r3
+ sar r2, $04
+ movd retrd, xmm1
+ sub retrd, r2d
+
+ add r7,32
+ add r7,r5
+
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+ POP_XMM
+
+ ret
+
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+
+ %assign push_num 0
+ LOAD_2_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1,r1d
+
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4
+%endif
+
+ mov r5,r7
+ and r5,0fh
+ sub r7,r5
+ sub r7,32
+
+
+ mov r2,r1
+ sal r2,$01 ;r2 = 2*iLineSize
+ mov r3,r2
+ add r3,r1 ;r3 = 3*iLineSize
+
+ mov r4,r2
+ sal r4,$01 ;r4 = 4*iLineSize
+
+ pxor xmm7, xmm7
+
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7],xmm0
+
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [r7+8],xmm1
+
+
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16],xmm0
+
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [r7+24],xmm1
+
+
+ movdqa xmm0,[r7]
+ movdqa xmm1,[r7+16]
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+
+ pmullw xmm1, xmm1
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+
+
+ movd r2d, xmm0
+ and r2, 0ffffh ; effective low work truncated
+ mov r3, r2
+ imul r2, r3
+ sar r2, $04
+ movd retrd, xmm1
+ sub retrd, r2d
+
+ add r7,32
+ add r7,r5
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+ POP_XMM
+
+ ret
+
+;***********************************************************************
+; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;***********************************************************************
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0,[r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
+ pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pshufd xmm4, xmm3, 01Bh
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm4, 0B1h
+ paddd xmm3, xmm4
+ movd r0d, xmm3
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
+.threshold_exit:
+ mov retrd, 15
+ ret
+
+;***********************************************************************
+; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;***********************************************************************
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
+ %assign push_num 0
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ pshufd xmm1, xmm0, 01Bh
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h
+ paddd xmm1, xmm2
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h
+ psubd xmm3, xmm2
+
+ ; to replace pmulld functionality as below
+ movdqa xmm2, xmm3
+ pmuludq xmm2, xmm3
+ pshufd xmm4, xmm3, 0B1h
+ pmuludq xmm4, xmm4
+ movdqa xmm5, xmm2
+ punpckldq xmm5, xmm4
+ punpckhdq xmm2, xmm4
+ punpcklqdq xmm5, xmm2
+
+ pshufd xmm4, xmm5, 01Bh
+ paddd xmm4, xmm5
+ pshufd xmm5, xmm4, 0B1h
+ paddd xmm5, xmm4
+
+ movd r0d, xmm5
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ jb near .threshold_exit
+ pshufd xmm0, xmm0, 01Bh
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0
+ ret
+.threshold_exit:
+ mov retrd, 15
+ ret
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -52,7 +52,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+ AdditionalIncludeDirectories="../../../common/inc;../../interface;../../src/common"
PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -137,7 +137,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+ AdditionalIncludeDirectories="../../../common/inc;../../interface;../../src/common"
PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -224,7 +224,7 @@
Optimization="3"
EnableIntrinsicFunctions="false"
FavorSizeOrSpeed="1"
- AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+ AdditionalIncludeDirectories="../../../common/inc/;../../interface;../../src/common"
PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
RuntimeLibrary="0"
EnableFunctionLevelLinking="false"
@@ -314,7 +314,7 @@
Optimization="3"
EnableIntrinsicFunctions="false"
FavorSizeOrSpeed="1"
- AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+ AdditionalIncludeDirectories="../../../common/inc/;../../interface;../../src/common"
PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
RuntimeLibrary="0"
EnableFunctionLevelLinking="false"
@@ -380,7 +380,7 @@
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
- RelativePath="..\..\..\common\cpu.cpp"
+ RelativePath="..\..\..\common\src\cpu.cpp"
>
</File>
<File
@@ -388,7 +388,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\WelsThreadLib.cpp"
+ RelativePath="..\..\..\common\src\WelsThreadLib.cpp"
>
</File>
<File
@@ -446,7 +446,7 @@
Name="Header Files"
>
<File
- RelativePath="..\..\..\common\cpu.h"
+ RelativePath="..\..\..\common\inc\cpu.h"
>
</File>
<File
@@ -454,7 +454,7 @@
>
</File>
<File
- RelativePath="..\..\..\common\WelsThreadLib.h"
+ RelativePath="..\..\..\common\inc\WelsThreadLib.h"
>
</File>
<File
@@ -474,7 +474,7 @@
Name="ASM"
>
<File
- RelativePath="..\..\..\common\cpuid.asm"
+ RelativePath="..\..\..\common\x86\cpuid.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -490,7 +490,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -508,7 +508,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -521,7 +521,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -530,7 +530,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -539,7 +539,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -548,7 +548,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -561,7 +561,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -570,7 +570,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -579,7 +579,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -588,13 +588,13 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\common\satd_sad.asm"
+ RelativePath="..\..\..\common\x86\satd_sad.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -601,7 +601,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -610,7 +610,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -619,7 +619,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -628,7 +628,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -641,7 +641,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -650,7 +650,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -659,7 +659,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -668,7 +668,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>