ref: a913cc853e517c2a5a0f79cc72cd5df590d82317
parent: 90e0057ba6b46df54897bda88869665c7dd08fe1
parent: f9dea467123fbff2c74422a8634b20af4026de49
author: Ethan Hugg <[email protected]>
date: Fri Dec 13 03:54:14 EST 2013
Merge pull request #32 from mstorsjo/cosmetics Consistently use unix newlines, remove trailing whitespace
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
OpenH264
=======
OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
-
+
Encoder Features
------------------------
- Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -17,10 +17,10 @@
- Single reference frame for inter prediction
- Multiple reference frames when using LTR and/or 3-4 temporal layers
- Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
-- Dynamic changes to bit rate, frame rate, and resolution
+- Dynamic changes to bit rate, frame rate, and resolution
- Annex B byte stream output
- YUV 4:2:0 planar input
-
+
Decoder Features
------------------------
- Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -32,7 +32,7 @@
- Multiple reference frames when specified in Sequence Parameter Set (SPS)
- Annex B byte stream input
- YUV 4:2:0 planar output
-
+
OS Support
----------------
- Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@@ -40,7 +40,7 @@
- Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
- Android 32-bit (initial release does not include this target, will follow soon)
- iOS 64-bit and 32-bit (not supported yet, may be added in the future)
-
+
Processor Support
-------------------------
- Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@@ -53,30 +53,30 @@
: build the decoder library and executable via codec/build/linux/dec/makefile
: build the encoder library and executable via codec/build/linux/enc/makefile
: build the encoder shared library via processing/build/linux/makefile
-
+
Windows Visual Studio 2008/2010/2012 projects are available:
: build the decoder via the Visual Studio projects in codec/build/win32/dec
: build the encoder via the Visual Studio projects in codec/build/win32/enc
: build the encoder shared library via the Visual Studio projects in processing/build/win32/
-
+
NASM needed to be installed for assembly code: workable version 2.07 or above, nasm can downloaded from http://www.nasm.us/
-
+
API details to be provided later.
-
+
Using the Test App
-------------------------
Linux shell scripts to build the test apps:
: build via testbin/AutoBuild_Linux.sh
: clean via testbin/AutoClean_Linux.sh
-
+
Windows batch files to build the test apps:
: Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
: Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
: Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
-
+
Usage information can be found in testbin/CmdLineReadMe
Command line options and details to be provided later.
-
+
Using the Source
-----------------------
codec - encoder, decoder, console (test app), build (makefile, vcproj)
@@ -83,7 +83,7 @@
processing - raw pixel processing (used by encoder)
testbin - autobuild scripts, test app config files, yuv test files
bin - binaries for library and test app
-
+
Known Issues
-------------------
See the issue tracker on https://github.com/cisco/openh264/issues
@@ -91,7 +91,7 @@
- Encoder errors when compressed frame size exceeds half uncompressed size
- Encoder console app only support multiple of 16 width/height for now
- Decoder errors when compressed frame size exceeds 1MB
-
+
License
----------
BSD, see LICENSE file for details.
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -19,7 +19,7 @@
def write_cpp_rule(f, x):
src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-
+
f.write("%s: %s\n"%(dst, src))
f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
f.write("\n")
@@ -27,7 +27,7 @@
def write_asm_rule(f, x):
src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-
+
f.write("%s: %s\n"%(dst, src))
f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
f.write("\n")
@@ -70,7 +70,7 @@
f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
for c in cpp:
f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")
+f.write("\n")
f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))
f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@@ -25,7 +25,7 @@
ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
LIBS= -lstdc++ -ldl
-#-lm
+#-lm
CFLAGS= $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
ifeq ($(DBG),1)
@@ -65,7 +65,7 @@
$(CORESRCDIR)/utils.cpp \
$(PLUSSRCDIR)/welsDecoderExt.cpp \
$(PLUSSRCDIR)/welsCodecTrace.cpp \
-$(COMMONSRCDIR)/logging.cpp
+$(COMMONSRCDIR)/logging.cpp
ASMSRC= $(ASMSRCDIR)/block_add.asm \
$(ASMSRCDIR)/cpuid.asm \
@@ -78,7 +78,7 @@
$(ASMSRCDIR)/mc_luma.asm \
$(ASMSRCDIR)/memzero.asm \
$(ASMSRCDIR)/asm_inc.asm \
-
+
MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
$(MAINSRCDIR)/h264dec.cpp \
$(MAINSRCDIR)/read_config.cpp
@@ -119,7 +119,7 @@
$(OBJDIR)/mb_copy.o \
$(OBJDIR)/mc_luma.o \
$(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o
+$(OBJDIR)/asm_inc.o
endif
OBJBIN= $(OBJDIR)/d3d9_utils.o \
@@ -134,7 +134,7 @@
dependencies:
@echo "" >dependencies
-
+
checkdir:
@echo 'checkdir..'
@if test ! -d $(BINDIR) ; \
@@ -154,7 +154,7 @@
mkdir -p $(OBJDIR) ; \
fi
@echo
-
+
release:
@echo 'release..'
@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -169,14 +169,14 @@
@rm -f $(OBJBIN)
@rm -f $(BINLIB)
@rm -f $(SHAREDLIB)
- @rm -f $(BIN)
+ @rm -f $(BIN)
tags:
@echo update tag table
@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-
-
-lib: $(OBJDEC)
+
+
+lib: $(OBJDEC)
@echo '$(OBJDEC)'
@echo
@echo 'ar cr $(BINLIB) $(OBJDEC)'
@@ -197,15 +197,15 @@
@$(CXX) -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC) $(LIBS)
@echo '... done'
@echo
-
+
exe: $(OBJBIN)
- @echo
+ @echo
@echo '$(OBJBIN)'
@echo
@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
@echo 'creating binary "$(BIN)"'
- @$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
+ @$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
@echo '... done'
@echo
@@ -223,31 +223,31 @@
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+
$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
@echo 'compiling object file "$@" ...'
- @$(AS) $(ASFLAGS) -o $@ $<
+ @$(AS) $(ASFLAGS) -o $@ $<
#$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
# @echo 'compiling object file "$@" ...'
# @$(AS) $(ASFLAGS) -o $@ $<
-
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+
include $(DEPEND)
--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@@ -26,8 +26,8 @@
ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
LIBS= -lstdc++ -ldl -lpthread -lm
-#-lm
-CFLAGS= $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
+#-lm
+CFLAGS= $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
ifeq ($(DBG),1)
#SUFFIX= .dbg
@@ -150,7 +150,7 @@
$(OBJDIR)/satd_sad.o \
$(OBJDIR)/score.o \
$(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o
+$(OBJDIR)/vaa.o
endif
OBJBIN= $(OBJDIR)/read_config.o \
$(OBJDIR)/welsenc.o
@@ -163,7 +163,7 @@
dependencies:
@echo "" >dependencies
-
+
checkdir:
@echo 'checkdir..'
@if test ! -d $(OUTDIR) ; \
@@ -195,9 +195,9 @@
tags:
@echo update tag table
@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-
-
-lib: $(OBJENC)
+
+
+lib: $(OBJENC)
@echo '$(OBJENC)'
@echo
@echo 'ar cr $(BINLIB) $(OBJENC)'
@@ -218,7 +218,7 @@
@$(GCC) -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC) $(LIBS)
@echo '... done'
@echo
-
+
release:
@echo 'release..'
@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -228,7 +228,7 @@
@echo
exe: $(OBJBIN)
- @echo
+ @echo
@echo '$(OBJBIN)'
@echo
@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@@ -251,24 +251,24 @@
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
@echo 'compiling object file "$@" ...'
- @$(AS) $(ASFLAGS) -o $@ $<
-
+ @$(AS) $(ASFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -48,7 +48,7 @@
; Macros and other preprocessor constants
;*******************************************************************************
-%macro BLOCK_ADD_16_SSE2 4
+%macro BLOCK_ADD_16_SSE2 4
movdqa xmm0, [%2]
movdqa xmm1, [%3]
movdqa xmm2, [%3+10h]
@@ -65,7 +65,7 @@
lea %2, [%2+%4]
lea %3, [%3+%4*2]
- lea %1, [%1+%4]
+ lea %1, [%1+%4]
%endmacro
%macro BLOCK_ADD_8_MMXEXT 4
@@ -106,7 +106,7 @@
lea %2, [%2+%4]
lea %3, [%3+%5*2]
- lea %1, [%1+%4]
+ lea %1, [%1+%4]
%endmacro
@@ -130,24 +130,24 @@
lea %1, [%1+%4]
%endmacro
-%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
+%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
movdqa xmm1, [%3]
movq xmm0, [%2]
punpcklbw xmm0, xmm7
paddw xmm0, xmm1
packuswb xmm0, xmm7
- movq [%1], xmm0
-
+ movq [%1], xmm0
+
movdqa xmm3, [%3+%5*2]
movq xmm2, [%2+%4]
punpcklbw xmm2, xmm7
paddw xmm2, xmm3
- packuswb xmm2, xmm7
- movq [%1+%4], xmm2
-
+ packuswb xmm2, xmm7
+ movq [%1+%4], xmm2
+
lea %1, [%1+%4*2]
lea %2, [%2+%4*2]
- lea %3, [%3+%5*4]
+ lea %3, [%3+%5*4]
%endmacro
%macro CHECK_DATA_16_ZERO_SSE4 3
@@ -159,7 +159,7 @@
por xmm0, xmm1
ptest xmm7, xmm0
cmovae eax, %3
-
+
add %1, 20h
add ecx, 04h
mov byte [%2+ebx], al
@@ -170,12 +170,12 @@
movdqa xmm1, [%1+%3]
movdqa xmm2, [%1+%3*2]
movdqa xmm3, [%1+%4]
-
+
mov eax, 0h
mov ebx, 0h
movdqa xmm4, xmm0
movdqa xmm5, xmm2
-
+
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
@@ -183,12 +183,12 @@
por xmm0, xmm2
por xmm4, xmm5
-
+
ptest xmm7, xmm0
cmovae eax, %5
ptest xmm7, xmm4
- cmovae ebx, %5
-
+ cmovae ebx, %5
+
mov byte [%2], al
mov byte [%2+1], bl
%endmacro
@@ -230,45 +230,45 @@
movdqa xmm0, [%1]
movdqa xmm1, [%1+10h]
mov ebx, [ecx]
-
+
pcmpeqw xmm0, xmm7
pcmpeqw xmm1, xmm7
packsswb xmm0, xmm1
- pmovmskb edx, xmm0
+ pmovmskb edx, xmm0
sub edx, 0ffffh
-
- cmovb eax, ebp
+
+ cmovb eax, ebp
add ecx, 4
add %1, 20h
mov byte [%2+ebx], al
%endmacro
-
+
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 5
movdqa xmm0, [%1]
movdqa xmm1, [%1 + %3]
movdqa xmm2, [%1 + %3*2]
- movdqa xmm3, [%1 + %4]
-
+ movdqa xmm3, [%1 + %4]
+
movdqa xmm4, xmm0
movdqa xmm5, xmm2
-
+
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
-
+
pcmpeqw xmm0, xmm7
pcmpeqw xmm2, xmm7
pcmpeqw xmm4, xmm7
pcmpeqw xmm5, xmm7
-
+
packsswb xmm0, xmm2
packsswb xmm4, xmm5
pmovmskb eax, xmm0
pmovmskb ebx, xmm4
-
+
sub eax, 0ffffh
mov eax, 0
cmovb eax, %5
@@ -276,7 +276,7 @@
mov ebx, 0
cmovb ebx, %5
mov byte [%2], al
- mov byte [%2+1], bl
+ mov byte [%2+1], bl
%endmacro
;*******************************************************************************
@@ -291,12 +291,12 @@
ALIGN 16
SubMbScanIdx:
- dd 0x0, 0x1, 0x4, 0x5,
+ dd 0x0, 0x1, 0x4, 0x5,
dd 0x2, 0x3, 0x6, 0x7,
dd 0x8, 0x9, 0xc, 0xd,
dd 0xa, 0xb, 0xe, 0xf,
dd 0x10, 0x11, 0x14, 0x15,
- dd 0x12, 0x13, 0x16, 0x17,
+ dd 0x12, 0x13, 0x16, 0x17,
;*******************************************************************************
; Code
@@ -312,10 +312,10 @@
; void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;*******************************************************************************
WelsResBlockZero16x16_sse2:
- push esi
+ push esi
mov esi, [esp+08h]
- mov ecx, [esp+0ch]
+ mov ecx, [esp+0ch]
lea ecx, [ecx*2]
lea eax, [ecx*3]
@@ -375,7 +375,7 @@
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
-
+
pop esi
ret
@@ -386,7 +386,7 @@
;*******************************************************************************
; void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
-WelsResBlockZero8x8_sse2:
+WelsResBlockZero8x8_sse2:
push esi
mov esi, [esp+08h]
@@ -407,7 +407,7 @@
movdqa [esi+ecx*2], xmm7
movdqa [esi+eax], xmm7
-
+
pop esi
ret
--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* ?Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* ?Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* dct.asm
-;*
-;* Abstract
-;* WelsDctFourT4_sse2
-;*
-;* History
-;* 8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
- movq %3, %2
- psraw %3, $1
- paddw %3, %1
- psraw %1, $1
- psubw %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
- movq %3, %2
- psubw %2, %1
- paddw %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
- MMX_SumSub %4, %5, %6
- MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
- movd %2, %5
- punpcklbw %2, %4
- paddw %1, %3
- psraw %1, $6
- paddsw %1, %2
- packuswb %1, %2
- movd %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define pushsize 0
-%define pPred esp+pushsize+4
-%define kiStride esp+pushsize+8
-%define pRs esp+pushsize+12
-
- mov eax, [pRs ]
- mov edx, [pPred ]
- mov ecx, [kiStride]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
-
- WELS_Zero mm7
- WELS_DW32 mm6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
- lea edx, [edx+2*ecx]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
-
-%undef pushsize
-%undef pPred
-%undef kiStride
-%undef pRs
- emms
- ret
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* ?Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* ?Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* dct.asm
+;*
+;* Abstract
+;* WelsDctFourT4_sse2
+;*
+;* History
+;* 8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+ movq %3, %2
+ psraw %3, $1
+ paddw %3, %1
+ psraw %1, $1
+ psubw %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+ movq %3, %2
+ psubw %2, %1
+ paddw %1, %3
+%endmacro
+
+%macro MMX_IDCT 6
+ MMX_SumSub %4, %5, %6
+ MMX_SumSubDiv2 %3, %2, %1
+ MMX_SumSub %1, %4, %6
+ MMX_SumSub %3, %5, %6
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5
+ movd %2, %5
+ punpcklbw %2, %4
+ paddw %1, %3
+ psraw %1, $6
+ paddsw %1, %2
+ packuswb %1, %2
+ movd %5, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+; void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define pushsize 0
+%define pPred esp+pushsize+4
+%define kiStride esp+pushsize+8
+%define pRs esp+pushsize+12
+
+ mov eax, [pRs ]
+ mov edx, [pPred ]
+ mov ecx, [kiStride]
+ movq mm0, [eax+ 0]
+ movq mm1, [eax+ 8]
+ movq mm2, [eax+16]
+ movq mm3, [eax+24]
+
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
+
+ WELS_Zero mm7
+ WELS_DW32 mm6
+
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
+ lea edx, [edx+2*ecx]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
+
+%undef pushsize
+%undef pPred
+%undef kiStride
+%undef pRs
+ emms
+ ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-; Reads the 4 bytes at offsets -2..+1 (named p1,p0,q0,q1 below) of 8 rows of pPixCb and 8 rows of pPixCr; where iAlpha>|p0-q0|, iBeta>|p1-p0| and iBeta>|q1-q0| it stores p0=(2*p1+p0+q1+2)>>2 and q0=(2*q1+q0+p1+2)>>2.
-WELS_EXTERN DeblockChromaEq4H_sse2
-; Arguments are read from the stack ([ebp+8]=pPixCb, +0Ch=pPixCr, +10h=iStride, +14h=iAlpha, +18h=iBeta); esi/edi are saved and restored, eax/ecx/edx and xmm0-xmm7 are overwritten; p1/q1 are written back with the values that were read.
-ALIGN 16
-; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm that callers only select this routine when SSSE3 is available.
-DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h ; 16-byte align the frame so movdqa can address the stack slots
- sub esp,0C8h
- mov ecx,dword [ebp+8] ; ecx = pPixCb
- mov edx,dword [ebp+0Ch] ; edx = pPixCr
- mov eax,dword [ebp+10h] ; eax = iStride
- sub ecx,2 ; start 2 bytes to the left so one dword covers p1,p0,q0,q1
- sub edx,2
- push esi ; esp drops by 4: the same slot is addressed with an offset 4 larger after each push
- lea esi,[eax+eax*2] ; esi = 3*iStride
- mov dword [esp+18h],ecx ; save pPixCb-2 (read back as [esp+1Ch] after push edi)
- mov dword [esp+4],edx ; save pPixCr-2 (read back as [esp+8] after push edi)
- lea ecx,[ecx+eax*4] ; ecx = pPixCb-2 + 4*iStride (row 4)
- lea edx,[edx+eax*4] ; edx = pPixCr-2 + 4*iStride (row 4)
- lea eax,[esp+7Ch] ; address of the 64-byte transpose buffer (= [esp+80h] after push edi)
- push edi
- mov dword [esp+14h],esi ; save 3*iStride
- mov dword [esp+18h],ecx ; save Cb row-4 pointer
- mov dword [esp+0Ch],edx ; save Cr row-4 pointer
- mov dword [esp+10h],eax ; save transpose buffer pointer
- mov esi,dword [esp+1Ch] ; esi = pPixCb-2
- mov ecx,dword [ebp+10h] ; ecx = iStride
- mov edx,dword [esp+14h] ; edx = 3*iStride
- movd xmm0,dword [esi] ; Cb rows 0..3 -> xmm0..xmm3
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8] ; esi = pPixCr-2
- movd xmm4,dword [esi] ; Cr rows 0..3 -> xmm4..xmm7
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4 ; xmmN = { Cb row N, Cr row N }
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h] ; esi = Cb row 4
- mov edi,dword [esp+0Ch] ; edi = Cr row 4
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4 ; xmm0 = { Cb r0, Cr r0, Cb r4, Cr r4 }
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4 ; xmm1 = { Cb r1, Cr r1, Cb r5, Cr r5 }
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4 ; xmm2 = { Cb r2, Cr r2, Cb r6, Cr r6 }
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4 ; xmm3 = { Cb r3, Cr r3, Cb r7, Cr r7 }
- movdqa xmm6,xmm0 ; byte transpose starts here: 4 registers of rows -> 4 registers of columns
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h] ; edi = transpose buffer
- movdqa [edi],xmm0 ; column p1: bytes 0-7 from Cb rows 0-7, bytes 8-15 from Cr rows 0-7
- movdqa [edi+10h],xmm5 ; column p0
- movdqa [edi+20h],xmm1 ; column q0
- movdqa [edi+30h],xmm6 ; column q1
- movsx ecx,word [ebp+14h] ; ecx = low 16 bits of iAlpha, sign-extended
- movsx edx,word [ebp+18h] ; edx = low 16 bits of iBeta, sign-extended
- movdqa xmm6,[esp+80h] ; xmm6 = p1 column (same memory as [edi])
- movdqa xmm4,[esp+90h] ; xmm4 = p0 column
- movdqa xmm5,[esp+0A0h] ; xmm5 = q0 column
- movdqa xmm7,[esp+0B0h] ; xmm7 = q1 column
- pxor xmm0,xmm0 ; zero, used to widen bytes to words
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0 ; xmm1 = iAlpha in all 8 words
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0 ; xmm2 = iBeta in all 8 words
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6 ; [esp+60h] = p1, high 8 bytes (Cr) as words
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6 ; [esp+30h] = p0 high
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6 ; [esp+40h] = q0 high
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6 ; [esp+70h] = q1 high
- punpcklbw xmm7,xmm0 ; low 8 bytes (Cb) as words: xmm7=q1, xmm4=p0, xmm5=q0, xmm3=p1
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7 ; [esp+50h] = q1 low
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6 ; |p0-q0| (pabsw: SSSE3)
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6 ; iAlpha > |p0-q0|
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6 ; |p1-p0|
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6 ; iBeta > |p1-p0|
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6 ; |q1-q0|
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6 ; iBeta > |q1-q0|
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6 ; high half: iAlpha > |p0-q0|
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7 ; xmm0 = filter mask for the low (Cb) half
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6 ; high half: iBeta > |p1-p0|
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6 ; high half: iBeta > |q1-q0|
- pand xmm1,xmm2 ; xmm1 = filter mask for the high (Cr) half
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2 ; [esp+20h] = rounding constant 2 in all 8 words
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2 ; (2*p1 + p0 + q1 + 2) >> 2, low half
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2 ; filtered value where the mask is set, original p0 elsewhere
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2 ; (2*p1 + p0 + q1 + 2) >> 2, high half
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4 ; new p0 column: words back to 16 unsigned bytes
- movdqa [esp+90h],xmm6 ; overwrite the p0 column in the transpose buffer
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2 ; (2*q1 + q0 + p1 + 2) >> 2, low half
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2 ; (2*q1 + q0 + p1 + 2) >> 2, high half
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3 ; new q0 column
- movdqa [esp+0A0h],xmm4 ; overwrite the q0 column in the transpose buffer
- mov esi,dword [esp+10h] ; esi = transpose buffer
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0 ; the same unpack sequence as above turns the 4 columns back into rows
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch] ; esi = pPixCb-2
- mov ecx,dword [ebp+10h] ; ecx = iStride
- mov edx,dword [esp+14h] ; edx = 3*iStride
- mov edi,dword [esp+8] ; edi = pPixCr-2
- movd dword [esi],xmm0 ; store Cb rows 0..3
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4 ; shift the next 4-byte row into the low dword
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h] ; esi = Cb row 4
- movd dword [edi],xmm0 ; store Cr rows 0..3
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0 ; store Cb rows 4..7
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch] ; edi = Cr row 4
- movd dword [edi],xmm0 ; store Cr rows 4..7
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp ; drop the aligned frame
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-; Reads the 4 bytes at offsets -2..+1 (named p1,p0,q0,q1 below) of 8 rows of pPixCb and 8 rows of pPixCr; with d = clamp(((q0-p0)*4 + (p1-q1) + 4)>>3, -tc, tc) it stores p0+d and q0-d (saturated to bytes) where iAlpha>|p0-q0|, iBeta>|p1-p0|, iBeta>|q1-q0| and tc>0.
-WELS_EXTERN DeblockChromaLt4H_sse2
-; Arguments are read from the stack ([ebp+8]=pPixCb, +0Ch=pPixCr, +10h=iStride, +14h=iAlpha, +18h=iBeta, +1Ch=pTC); pTC[k] is used as tc for rows 2k and 2k+1 of both planes; esi/edi are saved and restored, eax/ecx/edx and xmm0-xmm7 are overwritten.
-ALIGN 16
-; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm that callers only select this routine when SSSE3 is available.
-DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h ; 16-byte align the frame so movdqa can address the stack slots
- sub esp,108h
- mov ecx,dword [ebp+8] ; ecx = pPixCb
- mov edx,dword [ebp+0Ch] ; edx = pPixCr
- mov eax,dword [ebp+10h] ; eax = iStride
- sub ecx,2 ; start 2 bytes to the left so one dword covers p1,p0,q0,q1
- sub edx,2
- push esi ; esp drops by 4: the same slot is addressed with an offset 4 larger after each push
- lea esi,[eax+eax*2] ; esi = 3*iStride
- mov dword [esp+10h],ecx ; save pPixCb-2 (read back as [esp+14h] after push edi)
- mov dword [esp+4],edx ; save pPixCr-2 (read back as [esp+8] after push edi)
- lea ecx,[ecx+eax*4] ; ecx = pPixCb-2 + 4*iStride (row 4)
- lea edx,[edx+eax*4] ; edx = pPixCr-2 + 4*iStride (row 4)
- lea eax,[esp+6Ch] ; address of the 64-byte transpose buffer (= [esp+70h] after push edi)
- push edi
- mov dword [esp+0Ch],esi ; save 3*iStride
- mov dword [esp+18h],ecx ; save Cb row-4 pointer
- mov dword [esp+10h],edx ; save Cr row-4 pointer
- mov dword [esp+1Ch],eax ; save transpose buffer pointer
- mov esi,dword [esp+14h] ; esi = pPixCb-2
- mov ecx,dword [ebp+10h] ; ecx = iStride
- mov edx,dword [esp+0Ch] ; edx = 3*iStride
- movd xmm0,dword [esi] ; Cb rows 0..3 -> xmm0..xmm3
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8] ; esi = pPixCr-2
- movd xmm4,dword [esi] ; Cr rows 0..3 -> xmm4..xmm7
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4 ; xmmN = { Cb row N, Cr row N }
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h] ; esi = Cb row 4
- mov edi,dword [esp+10h] ; edi = Cr row 4
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4 ; xmm0 = { Cb r0, Cr r0, Cb r4, Cr r4 }
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4 ; xmm1 = { Cb r1, Cr r1, Cb r5, Cr r5 }
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4 ; xmm2 = { Cb r2, Cr r2, Cb r6, Cr r6 }
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4 ; xmm3 = { Cb r3, Cr r3, Cb r7, Cr r7 }
- movdqa xmm6,xmm0 ; byte transpose starts here: 4 registers of rows -> 4 registers of columns
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch] ; edi = transpose buffer
- movdqa [edi],xmm0 ; column p1: bytes 0-7 from Cb rows 0-7, bytes 8-15 from Cr rows 0-7
- movdqa [edi+10h],xmm5 ; column p0
- movdqa [edi+20h],xmm1 ; column q0
- movdqa [edi+30h],xmm6 ; column q1
- mov eax,dword [ebp+1Ch] ; eax = pTC
- movsx cx,byte [eax+3] ; cx = pTC[3]
- movsx dx,byte [eax+2] ; dx = pTC[2]
- movsx si,byte [eax+1] ; si = pTC[1]
- movsx ax,byte [eax] ; ax = pTC[0]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0 ; [esp+60h] = zero vector
- movzx edx,ax
- movsx eax,word [ebp+14h] ; eax = low 16 bits of iAlpha, sign-extended
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h] ; ecx = low 16 bits of iBeta, sign-extended
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h] ; xmm1 = zero
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h] ; xmm5 = q1 column
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h] ; xmm6 = p1 column
- punpcklwd xmm0,xmm7 ; xmm0 = tc words { t0,t0,t1,t1,t2,t2,t3,t3 }
- movdqa xmm7,[esp+80h] ; xmm7 = p0 column
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2 ; [esp+0D0h] = -tc
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0 ; xmm4 = iAlpha in all 8 words
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h] ; xmm3 = q0 column
- movdqa [esp+50h],xmm2 ; [esp+50h] = iBeta in all 8 words
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2 ; [esp+40h] = p1 low (Cb) as words
- movdqa [esp+0B0h],xmm6 ; [esp+0B0h] = p1 high (Cr) as words
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1 ; xmm2 = p0 low
- punpcklbw xmm3,xmm1 ; xmm3 = q0 low
- punpcklbw xmm5,xmm1 ; xmm5 = q1 low
- movdqa [esp+0F0h],xmm7 ; [esp+0F0h] = p0 high
- movdqa [esp+0C0h],xmm6 ; [esp+0C0h] = q0 high
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6 ; [esp+0E0h] = q1 high
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6 ; [esp+30h] = rounding constant 4 in all 8 words
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5 ; p1 - q1, low half
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1 ; tc > 0
- movdqa [esp+60h],xmm6 ; [esp+60h] now holds the tc > 0 mask (zero vector no longer needed there)
- movdqa xmm1, [esp+0D0h] ; xmm1 = -tc
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2 ; (q0 - p0) * 4
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3 ; ((q0-p0)*4 + (p1-q1) + 4) >> 3
- pmaxsw xmm1,xmm6 ; lower clamp at -tc
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1 ; upper clamp at tc
- movdqa [esp+20h],xmm6 ; [esp+20h] = clamped delta, low half
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1 ; |p0-q0| (pabsw: SSSE3)
- pcmpgtw xmm6,xmm1 ; iAlpha > |p0-q0|
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1 ; |p1-p0|
- pcmpgtw xmm7,xmm1 ; iBeta > |p1-p0|
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5 ; |q1-q0|
- pcmpgtw xmm1,xmm5 ; iBeta > |q1-q0|
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h] ; p1 - q1, high half
- pand xmm6,xmm1
- pand xmm6, [esp+60h] ; ... and tc > 0
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1 ; [esp+40h] = masked delta, low half (p1 low no longer needed)
- movdqa xmm1, [esp+0F0h] ; xmm1 = p0 high
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3 ; same delta expression for the high half
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5 ; xmm0 = clamped delta, high half
- movdqa xmm5,[esp+0C0h] ; xmm5 = q0 high
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6 ; high half: iAlpha > |p0-q0|
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6 ; high half: iBeta > |p1-p0|
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6 ; high half: iBeta > |q1-q0|
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4 ; xmm0 = masked delta, high half
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4 ; p0 + delta, low
- paddw xmm1,xmm0 ; p0 + delta, high
- psubw xmm3,xmm4 ; q0 - delta, low
- psubw xmm5,xmm0 ; q0 - delta, high
- packuswb xmm2,xmm1 ; saturate to 16 unsigned bytes
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2 ; overwrite the p0 column in the transpose buffer
- movdqa [esp+90h],xmm3 ; overwrite the q0 column in the transpose buffer
- mov esi,dword [esp+1Ch] ; esi = transpose buffer
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0 ; the same unpack sequence as above turns the 4 columns back into rows
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h] ; esi = pPixCb-2
- mov ecx,dword [ebp+10h] ; ecx = iStride
- mov edx,dword [esp+0Ch] ; edx = 3*iStride
- mov edi,dword [esp+8] ; edi = pPixCr-2
- movd dword [esi],xmm0 ; store Cb rows 0..3
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4 ; shift the next 4-byte row into the low dword
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h] ; esi = Cb row 4
- movd dword [edi],xmm0 ; store Cr rows 0..3
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0 ; store Cb rows 4..7
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h] ; edi = Cr row 4
- movd dword [edi],xmm0 ; store Cr rows 4..7
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp ; drop the aligned frame
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-; Works on 16 adjacent bytes of six rows: p2,p1,p0 = pPix-3/-2/-1*iStride and q0,q1,q2 = pPix+0/+1/+2*iStride; rows p1,p0,q0,q1 are rewritten, p2 and q2 are only read. All rows are accessed with movdqa, so each row address must be 16-byte aligned.
-; Per pixel, where iAlpha>|q0-p0|, iBeta>|q0-q1|, iBeta>|p0-p1| and tc>=0: with ap=(iBeta>|p2-p0|), aq=(iBeta>|q2-q0|), d=clamp(((q0-p0)*4+(p1-q1)+4)>>3, -(tc+ap+aq), tc+ap+aq): p0+=d, q0-=d; if ap, p1+=clamp((p2+((p0+q0+1)>>1)-2*p1)>>1,-tc,tc); if aq, q1 likewise from q2.
-WELS_EXTERN DeblockLumaLt4V_sse2
-; Arguments are read from the stack ([ebp+8]=pPix, +12=iStride, +16=iAlpha, +20=iBeta, +24=pTC); pTC[k] is used as tc for pixels 4k..4k+3; ebx/esi/edi are saved and restored, eax/ecx/edx and xmm0-xmm7 are overwritten.
-ALIGN 16
-; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm that callers only select this routine when SSSE3 is available.
-DeblockLumaLt4V_sse2:
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8] ; eax = pPix (row q0)
- mov ecx, dword [ebp+12] ; ecx = iStride
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24] ; edx = pTC
- movdqa [esp+424-384], xmm0 ; zero vector; same slot as [esp+432-384] once esi and edi are pushed
- push esi
-
- lea esi, [ecx+ecx*2] ; esi = 3*iStride
- push edi ; from here on slots are written as [esp+432-N]
- mov edi, eax
- sub edi, esi ; edi = pPix - 3*iStride
- movdqa xmm0, [edi] ; row p2
-
- lea esi, [ecx+ecx] ; esi = 2*iStride
- movdqa [esp+432-208], xmm0 ; save p2 bytes
- mov edi, eax
- sub edi, esi ; edi = pPix - 2*iStride (row p1); kept for the final store
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0 ; save p1 bytes
-
- mov ebx, eax
- sub ebx, ecx ; ebx = pPix - iStride (row p0)
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0 ; save p0 bytes
-
- movdqa xmm0, [eax] ; row q0
-
- add ecx, eax ; ecx = pPix + iStride (row q1)
- movdqa [esp+480-208], xmm0 ; save q0 bytes
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx ; remember the q1 row pointer
-
- movsx ecx, word [ebp+16] ; ecx = low 16 bits of iAlpha, sign-extended
- movdqa [esp+496-208], xmm0 ; save q1 bytes
- movdqa xmm0, [esi+eax] ; row q2 = pPix + 2*iStride
-
- movsx si, byte [edx] ; si = pTC[0]
- movdqa [esp+512-208], xmm0 ; save q2 bytes
- movd xmm0, ecx
- movsx ecx, word [ebp+20] ; ecx = low 16 bits of iBeta, sign-extended
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0 ; [esp+432-112] = iAlpha in all 8 words
- movd xmm0, ecx
- movsx cx, byte [edx+1] ; cx = pTC[1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx ; remember the p0 row pointer
- movzx ebx, cx
- pshufd xmm0, xmm1, 0 ; xmm0 = iBeta in all 8 words
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0 ; [esp+432-336] = iBeta broadcast (slot is reused for the constant 4 later)
- movd xmm0, ecx
-
- movsx cx, byte [edx+3] ; cx = pTC[3]
- movsx dx, byte [edx+2] ; dx = pTC[2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7 ; xmm0 = tc words { t0,t0,t0,t0,t1,t1,t1,t1 }
- movdqa [esp+432-400], xmm0 ; [esp+432-400] = tc for pixels 0..7
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384] ; xmm0 = zero, used to widen bytes to words
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0 ; xmm2 = p1, pixels 0..7 as words
-
- mov ecx, 4
- movsx edx, cx ; edx = 4, broadcast further down as the rounding constant
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5 ; [esp+432-240] = q1 low
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5 ; [esp+432-352] = q2 low
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7 ; xmm1 = tc words { t2 x4, t3 x4 } for pixels 8..15
- punpcklbw xmm6, xmm0 ; xmm6 = q0 low
- punpcklbw xmm3, xmm0 ; xmm3 = p0 low
- punpcklbw xmm4, xmm0 ; xmm4 = p2 low
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7 ; |p0-p2| (pabsw: SSSE3)
- movdqa [esp+432-272], xmm4 ; [esp+432-272] = p2 low
- movdqa xmm4, [esp+432-336] ; xmm4 = iBeta
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5 ; ap mask: iBeta > |p2-p0|
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5 ; aq mask: iBeta > |q2-q0|
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5 ; (p0 + q0 + 1) >> 1
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288] ; a set mask word is -1, so subtracting it adds 1
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5 ; tc + ap + aq: clamp bound for the p0/q0 delta
- movdqa xmm5, xmm6
- psubw xmm5, xmm3 ; q0 - p0
- movdqa [esp+432-32], xmm6 ; [esp+432-32] = q0 low
- psubw xmm6, [esp+432-240] ; q0 - q1
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5 ; stash q0 - p0 (zero slot reused; xmm0 still holds zero)
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7 ; iAlpha > |q0-p0|
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6 ; iBeta > |q0-q1|
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6 ; iBeta > |p0-p1|
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6 ; tc >= 0
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5 ; [esp+432-320] = filter mask, low half
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5 ; [esp+432-336] = constant 4 in all words (iBeta stays in xmm4)
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5 ; -(tc + ap + aq)
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2 ; (q0 - p0) * 4
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240] ; p1 - q1
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3 ; ((q0-p0)*4 + (p1-q1) + 4) >> 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6 ; clamp to +/-(tc + ap + aq)
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5 ; [esp+432-64] = masked p0/q0 delta, low half
- movdqa [esp+432-384], xmm6 ; [esp+432-384] = tc low
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5 ; [esp+432-368] = -tc low
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304] ; p2 + ((p0+q0+1)>>1)
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7 ; ... - 2*p1
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6 ; clamp to [-tc, tc]
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288] ; only where ap holds
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5 ; [esp+432-96] = p1 adjustment, low half
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304] ; q2 + ((p0+q0+1)>>1)
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7 ; ... - 2*q1
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6 ; clamp to [-tc, tc]
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256] ; only where aq holds
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7 ; from here the scratch slots hold the high half: [esp+432-352] = q1 high
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5 ; [esp+432-48] = q1 adjustment, low half
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6 ; [esp+432-368] = p1 high
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0 ; xmm5 = p2 high
- movdqa [esp+432-384], xmm7 ; [esp+432-384] = q2 high
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6 ; [esp+432-400] = p0 high (tc low no longer needed)
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5 ; [esp+432-16] = p2 high
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0 ; xmm6 = q0 high
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5 ; ap mask, high half
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5 ; aq mask, high half
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6 ; [esp+432-80] = q0 high
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5 ; (p0 + q0 + 1) >> 1, high half
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5 ; tc + ap + aq, high half
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400] ; q0 - p0
- psubw xmm6, [esp+432-352] ; q0 - q1
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7 ; iAlpha > |q0-p0|
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6 ; iBeta > |q0-q1|
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352] ; xmm6 = p1 - q1, high half
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7 ; iBeta > |p0-p1| (last use of iBeta in xmm4)
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96] ; new p1 low = p1 + adjustment
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7 ; tc >= 0, high half
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5 ; [esp+432-320] = filter mask, high half
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4 ; -(tc + ap + aq)
- psubw xmm0, xmm1 ; xmm0 = -tc high (zero register given up)
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368] ; xmm5 = p1 high
- movdqa [esp+432-336], xmm0 ; [esp+432-336] = -tc high (constant 4 no longer needed)
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4 ; [esp+432-272] = masked p0/q0 delta, high half
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4 ; p2 + avg - 2*p1
-
- movdqa xmm4, [esp+432-64] ; xmm4 = delta, low half
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400] ; xmm0 = p0 high
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7 ; new p1 high
- packuswb xmm2, xmm5 ; xmm2 = new p1 row (16 unsigned bytes)
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5 ; p0 high + delta
- paddw xmm3, xmm4 ; p0 low + delta
- packuswb xmm3, xmm0 ; xmm3 = new p0 row
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4 ; q0 low - delta
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5 ; q0 high - delta
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48] ; new q1 low
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0 ; stash the new q0 row
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408] ; ecx = p0 row pointer
-
- mov edx, dword [esp+432-404] ; edx = q1 row pointer
- psubw xmm4, xmm0 ; q2 + avg - 2*q1, high half
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2 ; store row p1 (edi = pPix - 2*iStride)
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi ; esp rises by 4: the next slot reference uses 428 instead of 432
- pand xmm1, xmm6
- pand xmm1, [esp+428-256] ; aq mask, high half (same slot as [esp+432-256] before the pop)
- movdqa [ecx], xmm3 ; store row p0
- paddw xmm7, xmm1 ; new q1 high
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0 ; store row q0
- movdqa [edx], xmm5 ; store row q1
- pop ebx
- mov esp, ebp ; drop the aligned frame
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-
-DeblockLumaEq4V_sse2:
-
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx
- mov ebp, esp
- and esp,0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 0Ch]
- mov ecx, [ebp + 10h]
- lea edx, [eax + ecx * 8]
- lea ebx, [ecx*3]
-
- movq xmm0, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7
-
- lea eax, [eax + ecx * 4]
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7
-
- movdqa [esp], xmm0
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0
- movdqa xmm0, [esp]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- mov eax, [ebp + 14h]
- movdqa [eax], xmm4
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- and esp, 0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 10h]
- mov ecx, [ebp + 0Ch]
- mov edx, [ebp + 08h]
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea eax, [ecx * 3]
-
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- psrldq xmm4, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp
- pop ebp
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32 ; everything below is assembled as 32-bit x86 code
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData ; read-only data section, directive form used when FORMAT_COFF is defined
+%else
+SECTION .rodata align=16 ; read-only data section, 16-byte aligned, for every other object format
+%endif
+
+SECTION .text ; nothing is placed in .rodata before this switch; the routines below go into the code section
+
+;********************************************************************************
+; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_sse2
+
+ALIGN 16
+DeblockChromaEq4V_sse2: ; notation used below: p1/p0 = rows at pPix-2*iStride / pPix-iStride, q0/q1 = rows at pPix / pPix+iStride
+ push ebp
+ mov ebp,esp ; stack arguments are addressed through ebp from here on
+ and esp,0FFFFFFF0h ; round esp down to a 16-byte boundary for the movdqa spills
+ sub esp,68h ; 68h plus the 8 bytes pushed below (esi, edi) keeps esp 16-byte aligned
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx] ; Cr q0 row (8 bytes)
+ movq xmm5,[edx+ecx] ; Cr q1 row
+ push esi
+ push edi
+ lea esi,[edx+edx] ; esi = 2*iStride
+ mov edi,eax
+ sub edi,esi ; edi = pPixCb - 2*iStride
+ movq xmm1,[edi] ; Cb p1 row
+ mov edi,ecx
+ sub edi,esi ; edi = pPixCr - 2*iStride
+ movq xmm2,[edi] ; Cr p1 row
+ punpcklqdq xmm1,xmm2 ; xmm1 = p1 with Cb in the low 8 bytes and Cr in the high 8 bytes
+ mov esi,eax
+ sub esi,edx ; esi = pPixCb - iStride, kept until the p0 store at the end
+ movq xmm2,[esi] ; Cb p0 row
+ mov edi,ecx
+ sub edi,edx ; edi = pPixCr - iStride, kept until the p0 store at the end
+ movq xmm3,[edi] ; Cr p0 row
+ punpcklqdq xmm2,xmm3 ; xmm2 = p0 (Cb low, Cr high)
+ movq xmm3,[eax] ; Cb q0 row
+ punpcklqdq xmm3,xmm4 ; xmm3 = q0 (Cb low, Cr high)
+ movq xmm4,[edx+eax] ; Cb q1 row
+ mov edx, [ebp + 14h] ; iAlpha
+ punpcklqdq xmm4,xmm5 ; xmm4 = q1 (Cb low, Cr high)
+ movd xmm5,edx
+ mov edx, [ebp + 18h] ; iBeta
+ pxor xmm0,xmm0 ; zero register used to widen bytes to words
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0 ; xmm5 = low 16 bits of iAlpha replicated into all 8 words
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0 ; xmm6 = low 16 bits of iBeta replicated into all 8 words
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1 ; [esp+40h] = p1 Cr as words
+ movdqa [esp+60h],xmm7 ; [esp+60h] = p1 Cb as words
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7 ; [esp+10h] = p0 Cb as words (slot is reused for the constant 2 later)
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7 ; [esp+50h] = q0 Cb as words
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0 ; xmm4 = q1 Cr as words
+ punpckhbw xmm2,xmm0 ; xmm2 = p0 Cr as words
+ punpcklbw xmm7,xmm0 ; xmm7 = q1 Cb as words
+ movdqa [esp+30h],xmm3 ; [esp+30h] = q0 Cr as words
+ movdqa xmm3,[esp+10h] ; xmm3 = p0 Cb
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1 ; |p0 - q0| (Cb); note pabsw is an SSSE3 instruction despite the _sse2 suffix of this routine
+ movdqa [esp+20h],xmm4 ; [esp+20h] = q1 Cr as words
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1 ; Cb mask: alpha > |p0 - q0|
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1 ; |p1 - p0| (Cb)
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1 ; beta > |p1 - p0|
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1 ; |q1 - q0| (Cb)
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1 ; beta > |q1 - q0|
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1 ; |p0 - q0| (Cr)
+ pcmpgtw xmm5,xmm1 ; Cr mask: alpha > |p0 - q0|
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4 ; xmm0 = complete Cb filter mask (all three tests)
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1 ; |p1 - p0| (Cr)
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1 ; |q1 - q0| (Cr)
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6 ; xmm5 = complete Cr filter mask
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0 ; xmm1 = rounding constant 2 in all 8 words
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4 ; 2*p1 (Cb)
+ paddw xmm6,xmm3 ; + p0
+ paddw xmm6,xmm7 ; + q1
+ movdqa [esp+10h],xmm1 ; [esp+10h] now holds the constant 2 (p0 Cb stays live in xmm3)
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2 ; (2*p1 + p0 + q1 + 2) >> 2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3 ; original p0 where the mask is clear
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6 ; filtered p0 where the mask is set
+ por xmm1,xmm4 ; xmm1 = new p0 (Cb)
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3 ; 2*p1 (Cr)
+ movdqa xmm3,[esp+10h] ; xmm3 = constant 2
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2 ; (2*p1 + p0 + q1 + 2) >> 2 (Cr)
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6 ; xmm4 = new p0 (Cr)
+ packuswb xmm1,xmm4 ; xmm1 = new p0 bytes: Cb low half, Cr high half
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7 ; 2*q1 (Cb)
+ paddw xmm6,xmm4 ; + q0
+ paddw xmm6,[esp+60h] ; + p1
+ paddw xmm6,xmm3
+ psraw xmm6,2 ; (2*q1 + q0 + p1 + 2) >> 2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0 ; xmm2 = new q0 (Cb)
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0 ; 2*q1 (Cr)
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1 ; write Cb p0 row back to pPixCb - iStride
+ psraw xmm6,2 ; (2*q1 + q0 + p1 + 2) >> 2 (Cr)
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5 ; xmm4 = new q0 (Cr)
+ packuswb xmm2,xmm4 ; xmm2 = new q0 bytes: Cb low half, Cr high half
+ movq [eax],xmm2 ; write Cb q0 row back to pPixCb
+ psrldq xmm1,8
+ movq [edi],xmm1 ; write Cr p0 row back to pPixCr - iStride
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2 ; write Cr q0 row back to pPixCr
+ pop esi
+ mov esp,ebp ; drop the aligned frame
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2: ; notation used below: p1/p0 = rows at pPix-2*iStride / pPix-iStride, q0/q1 = rows at pPix / pPix+iStride
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h ; round esp down to a 16-byte boundary
+ sub esp,0E4h ; 0E4h plus the 12 bytes pushed below (ebx, esi, edi) keeps esp 16-byte aligned
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2] ; ebx = pTC[2], sign-extended
+ push edi
+ movsx di,byte [esi+3] ; di = pTC[3]
+ mov word [esp+0Ch],bx ; spill pTC[2]
+ movsx bx,byte [esi+1] ; bx = pTC[1]
+ movsx esi,byte [esi] ; esi = pTC[0]
+ mov word [esp+0Eh],si ; spill pTC[0]
+ movzx esi,di
+ movd xmm1,esi ; pTC[3]
+ movzx esi,di
+ movd xmm2,esi ; pTC[3]
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h] ; iStride
+ mov eax, [ebp + 08h] ; pPixCb
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch] ; pPixCr
+ movd xmm4,esi ; pTC[2]
+ movzx esi,bx
+ movd xmm5,esi ; pTC[1]
+ movd xmm3,edi ; pTC[2]
+ movzx esi,bx
+ movd xmm6,esi ; pTC[1]
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0 ; [esp+40h] = zero vector
+ movd xmm7,edi ; pTC[0]
+ movd xmm0,esi ; pTC[0]
+ lea esi,[edx+edx] ; esi = 2*iStride
+ mov edi,eax
+ sub edi,esi ; edi = pPixCb - 2*iStride
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h] ; xmm1 = zero, used to widen bytes to words
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx] ; Cr q1 row
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax] ; Cb q0 row
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi] ; Cb p1 row
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7 ; xmm0 = tc words {pTC[0],pTC[0],pTC[1],pTC[1],pTC[2],pTC[2],pTC[3],pTC[3]}
+ mov edi,ecx
+ sub edi,esi ; edi = pPixCr - 2*iStride
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2 ; [esp+60h] = -tc
+ movq xmm2, [edi] ; Cr p1 row
+ punpcklqdq xmm6,xmm2 ; xmm6 = p1 (Cb low 8 bytes, Cr high 8 bytes)
+ mov esi,eax
+ sub esi,edx ; esi = pPixCb - iStride, kept until the p0 store at the end
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx ; edi = pPixCr - iStride, kept until the p0 store at the end
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2 ; xmm7 = p0 (Cb low, Cr high)
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2 ; xmm3 = q0 (Cb low, Cr high)
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h] ; low 16 bits of iAlpha
+ punpcklqdq xmm2,xmm4 ; xmm2 = q1 (Cb low, Cr high)
+ movdqa [esp+0E0h],xmm2 ; [esp+0E0h] = q1 bytes
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h] ; low 16 bits of iBeta
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2 ; [esp+50h] = iBeta in all 8 words
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3 ; [esp+0D0h] = q0 bytes
+ pshufd xmm4,xmm4,0 ; xmm4 = iAlpha in all 8 words
+ movdqa [esp+30h],xmm2 ; [esp+30h] = p1 Cb as words
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6 ; [esp+80h] = p1 Cr as words
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6 ; [esp+70h] = q0 Cr as words
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6 ; [esp+90h] = q1 Cr as words
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1 ; xmm5 = q1 Cb as words
+ movdqa [esp+0A0h],xmm7 ; [esp+0A0h] = p0 Cr as words
+ punpcklbw xmm3,xmm1 ; xmm3 = q0 Cb as words
+ mov edx,4
+ punpcklbw xmm2,xmm1 ; xmm2 = p0 Cb as words
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6 ; [esp+20h] = rounding constant 4 in all 8 words
+ psubw xmm7,xmm5 ; p1 - q1 (Cb)
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h] ; xmm1 = -tc
+ movdqa [esp+40h],xmm6 ; [esp+40h] = mask of lanes with tc > 0
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2 ; (q0 - p0) << 2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3 ; delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ pmaxsw xmm1,xmm6 ; max(-tc, delta)
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1 ; min(tc, max(-tc, delta))
+ movdqa [esp+10h],xmm6 ; [esp+10h] = clipped Cb delta
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1 ; |p0 - q0| (Cb); note pabsw is an SSSE3 instruction despite the _sse2 suffix of this routine
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1 ; alpha > |p0 - q0|
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1 ; |p1 - p0| (Cb)
+ pcmpgtw xmm7,xmm1 ; beta > |p1 - p0|
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5 ; |q1 - q0| (Cb)
+ pcmpgtw xmm1,xmm5 ; beta > |q1 - q0|
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h] ; p1 - q1 (Cr)
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h] ; also require tc > 0
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1 ; [esp+30h] = masked Cb delta (zero where the edge is left untouched)
+ movdqa xmm1,[esp+0A0h] ; xmm1 = p0 Cr
+ psubw xmm6,xmm1
+ psllw xmm6,2 ; (q0 - p0) << 2 (Cr)
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5 ; xmm0 = clipped Cr delta
+ movdqa xmm5,[esp+70h] ; xmm5 = q0 Cr
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |p0 - q0| (Cr)
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6 ; |p1 - p0| (Cr)
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |q1 - q0| (Cr)
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4 ; xmm0 = masked Cr delta
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4 ; p0 + delta (Cb)
+ paddw xmm1,xmm0 ; p0 + delta (Cr)
+ packuswb xmm2,xmm1 ; saturate to bytes: Cb low half, Cr high half
+ movq [esi],xmm2 ; write Cb p0 row back to pPixCb - iStride
+ psubw xmm3,xmm4 ; q0 - delta (Cb)
+ psubw xmm5,xmm0 ; q0 - delta (Cr)
+ packuswb xmm3,xmm5
+ movq [eax],xmm3 ; write Cb q0 row back to pPixCb
+ psrldq xmm2,8
+ movq [edi],xmm2 ; write Cr p0 row back to pPixCr - iStride
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3 ; write Cr q0 row back to pPixCr
+ pop ebx
+ mov esp,ebp ; drop the aligned frame
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN DeblockChromaEq4H_sse2
+
+ALIGN 16
+
+DeblockChromaEq4H_sse2: ; notation used below: p1,p0,q0,q1 = the four byte columns at pPix-2 .. pPix+1, over 8 rows of Cb and 8 rows of Cr
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h ; round esp down to a 16-byte boundary
+ sub esp,0C8h ; 0C8h plus the 8 bytes pushed below (esi, edi) keeps esp 16-byte aligned
+ mov ecx,dword [ebp+8] ; pPixCb
+ mov edx,dword [ebp+0Ch] ; pPixCr
+ mov eax,dword [ebp+10h] ; iStride
+ sub ecx,2 ; start two bytes to the left of the pointer
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2] ; esi = 3*iStride
+ mov dword [esp+18h],ecx ; save pPixCb-2 (addressed as [esp+1Ch] once edi is pushed)
+ mov dword [esp+4],edx ; save pPixCr-2 (addressed as [esp+8] once edi is pushed)
+ lea ecx,[ecx+eax*4] ; pPixCb-2 + 4*iStride (rows 4..7)
+ lea edx,[edx+eax*4] ; pPixCr-2 + 4*iStride (rows 4..7)
+ lea eax,[esp+7Ch] ; address of the 64-byte transpose buffer (= esp+80h once edi is pushed, 16-byte aligned)
+ push edi
+ mov dword [esp+14h],esi ; [esp+14h] = 3*iStride
+ mov dword [esp+18h],ecx ; [esp+18h] = Cb rows 4..7 base
+ mov dword [esp+0Ch],edx ; [esp+0Ch] = Cr rows 4..7 base
+ mov dword [esp+10h],eax ; [esp+10h] = transpose buffer address
+ mov esi,dword [esp+1Ch] ; esi = pPixCb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+14h] ; edx = 3*iStride
+ movd xmm0,dword [esi] ; 4 bytes from each of Cb rows 0..3
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8] ; esi = pPixCr-2
+ movd xmm4,dword [esi] ; 4 bytes from each of Cr rows 0..3
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4 ; low qword of xmmN = Cb row N | Cr row N
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h] ; Cb rows 4..7
+ mov edi,dword [esp+0Ch] ; Cr rows 4..7
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4 ; high qword of xmmN = Cb row N+4 | Cr row N+4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0 ; transpose: byte, word, dword, then qword interleaves turn the 4 row registers into 4 column registers
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h] ; edi = transpose buffer (esp+80h)
+ movdqa [edi],xmm0 ; [esp+80h] = p1 column (Cb in low 8 bytes, Cr in high 8 bytes)
+ movdqa [edi+10h],xmm5 ; [esp+90h] = p0 column
+ movdqa [edi+20h],xmm1 ; [esp+0A0h] = q0 column
+ movdqa [edi+30h],xmm6 ; [esp+0B0h] = q1 column
+ movsx ecx,word [ebp+14h] ; low 16 bits of iAlpha
+ movsx edx,word [ebp+18h] ; low 16 bits of iBeta
+ movdqa xmm6,[esp+80h] ; p1 bytes
+ movdqa xmm4,[esp+90h] ; p0 bytes
+ movdqa xmm5,[esp+0A0h] ; q0 bytes
+ movdqa xmm7,[esp+0B0h] ; q1 bytes
+ pxor xmm0,xmm0 ; zero register used to widen bytes to words
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0 ; xmm1 = iAlpha in all 8 words
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0 ; xmm2 = iBeta in all 8 words
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6 ; [esp+60h] = p1 Cr as words
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6 ; [esp+30h] = p0 Cr as words
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6 ; [esp+40h] = q0 Cr as words
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6 ; [esp+70h] = q1 Cr as words
+ punpcklbw xmm7,xmm0 ; xmm7 = q1 Cb as words
+ punpcklbw xmm4,xmm0 ; xmm4 = p0 Cb as words
+ punpcklbw xmm5,xmm0 ; xmm5 = q0 Cb as words
+ punpcklbw xmm3,xmm0 ; xmm3 = p1 Cb as words
+ movdqa [esp+50h],xmm7 ; [esp+50h] = q1 Cb as words
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |p0 - q0| (Cb); note pabsw is an SSSE3 instruction despite the _sse2 suffix of this routine
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6 ; Cb mask: alpha > |p0 - q0|
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6 ; |p1 - p0| (Cb)
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |q1 - q0| (Cb)
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6 ; |p0 - q0| (Cr)
+ pcmpgtw xmm1,xmm6 ; Cr mask: alpha > |p0 - q0|
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6 ; |p1 - p0| (Cr)
+ pand xmm0,xmm7 ; xmm0 = complete Cb filter mask
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6 ; |q1 - q0| (Cr)
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2 ; xmm1 = complete Cr filter mask
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2 ; [esp+20h] = rounding constant 2 in all 8 words
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3 ; 2*p1 (Cb)
+ paddw xmm2,xmm4 ; + p0
+ paddw xmm2,[esp+50h] ; + q1
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2 ; (2*p1 + p0 + q1 + 2) >> 2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2 ; xmm6 = new p0 (Cb): filtered where the mask is set, original elsewhere
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2 ; 2*p1 (Cr)
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2 ; xmm4 = new p0 (Cr)
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6 ; overwrite the p0 column in the transpose buffer
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2 ; 2*q1 (Cb)
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5 ; + q0
+ paddw xmm6,xmm3 ; + p1
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2 ; (2*q1 + q0 + p1 + 2) >> 2
+ pand xmm4,xmm6
+ por xmm4,xmm0 ; xmm4 = new q0 (Cb)
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0 ; 2*q1 (Cr)
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1 ; xmm3 = new q0 (Cr)
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4 ; overwrite the q0 column in the transpose buffer
+ mov esi,dword [esp+10h] ; esi = transpose buffer
+ movdqa xmm0,[esi] ; reload the four columns (p1 and q1 unchanged, p0 and q0 updated)
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0 ; same interleave sequence as above, now turning columns back into rows
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch] ; esi = pPixCb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+14h] ; edx = 3*iStride
+ mov edi,dword [esp+8] ; edi = pPixCr-2
+ movd dword [esi],xmm0 ; write 4 bytes back to each of Cb rows 0..3
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4 ; shift the next 4-byte group into the low dword
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h] ; esi = Cb rows 4..7 base
+ movd dword [edi],xmm0 ; Cr rows 0..3
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0 ; Cb rows 4..7
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch] ; edi = Cr rows 4..7 base
+ movd dword [edi],xmm0 ; Cr rows 4..7
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp ; drop the aligned frame
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+
+ALIGN 16
+
+DeblockChromaLt4H_sse2: ; notation used below: p1,p0,q0,q1 = the four byte columns at pPix-2 .. pPix+1, over 8 rows of Cb and 8 rows of Cr
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h ; round esp down to a 16-byte boundary
+ sub esp,108h ; 108h plus the 8 bytes pushed below (esi, edi) keeps esp 16-byte aligned
+ mov ecx,dword [ebp+8] ; pPixCb
+ mov edx,dword [ebp+0Ch] ; pPixCr
+ mov eax,dword [ebp+10h] ; iStride
+ sub ecx,2 ; start two bytes to the left of the pointer
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2] ; esi = 3*iStride
+ mov dword [esp+10h],ecx ; save pPixCb-2 (addressed as [esp+14h] once edi is pushed)
+ mov dword [esp+4],edx ; save pPixCr-2 (addressed as [esp+8] once edi is pushed)
+ lea ecx,[ecx+eax*4] ; pPixCb-2 + 4*iStride (rows 4..7)
+ lea edx,[edx+eax*4] ; pPixCr-2 + 4*iStride (rows 4..7)
+ lea eax,[esp+6Ch] ; address of the 64-byte transpose buffer (= esp+70h once edi is pushed, 16-byte aligned)
+ push edi
+ mov dword [esp+0Ch],esi ; [esp+0Ch] = 3*iStride
+ mov dword [esp+18h],ecx ; [esp+18h] = Cb rows 4..7 base
+ mov dword [esp+10h],edx ; [esp+10h] = Cr rows 4..7 base
+ mov dword [esp+1Ch],eax ; [esp+1Ch] = transpose buffer address
+ mov esi,dword [esp+14h] ; esi = pPixCb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+0Ch] ; edx = 3*iStride
+ movd xmm0,dword [esi] ; 4 bytes from each of Cb rows 0..3
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8] ; esi = pPixCr-2
+ movd xmm4,dword [esi] ; 4 bytes from each of Cr rows 0..3
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4 ; low qword of xmmN = Cb row N | Cr row N
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h] ; Cb rows 4..7
+ mov edi,dword [esp+10h] ; Cr rows 4..7
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4 ; high qword of xmmN = Cb row N+4 | Cr row N+4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0 ; transpose: byte, word, dword, then qword interleaves turn the 4 row registers into 4 column registers
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch] ; edi = transpose buffer (esp+70h)
+ movdqa [edi],xmm0 ; [esp+70h] = p1 column (Cb in low 8 bytes, Cr in high 8 bytes)
+ movdqa [edi+10h],xmm5 ; [esp+80h] = p0 column
+ movdqa [edi+20h],xmm1 ; [esp+90h] = q0 column
+ movdqa [edi+30h],xmm6 ; [esp+0A0h] = q1 column
+ mov eax,dword [ebp+1Ch] ; pTC
+ movsx cx,byte [eax+3] ; cx = pTC[3]
+ movsx dx,byte [eax+2] ; dx = pTC[2]
+ movsx si,byte [eax+1] ; si = pTC[1]
+ movsx ax,byte [eax] ; ax = pTC[0]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx ; pTC[3]
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx ; pTC[2]
+ movd xmm4,edx ; pTC[2]
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx ; pTC[1]
+ pxor xmm0,xmm0
+ movd xmm6,edx ; pTC[1]
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0 ; [esp+60h] = zero vector
+ movzx edx,ax
+ movsx eax,word [ebp+14h] ; low 16 bits of iAlpha
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi ; pTC[3]
+ movd xmm7,ecx ; pTC[0]
+ movsx ecx,word [ebp+18h] ; low 16 bits of iBeta
+ movd xmm0,edx ; pTC[0]
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h] ; xmm1 = zero, used to widen bytes to words
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h] ; q1 bytes
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h] ; p1 bytes
+ punpcklwd xmm0,xmm7 ; xmm0 = tc words {pTC[0],pTC[0],pTC[1],pTC[1],pTC[2],pTC[2],pTC[3],pTC[3]}
+ movdqa xmm7,[esp+80h] ; p0 bytes
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2 ; [esp+0D0h] = -tc
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0 ; xmm4 = iAlpha in all 8 words
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h] ; q0 bytes
+ movdqa [esp+50h],xmm2 ; [esp+50h] = iBeta in all 8 words
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2 ; [esp+40h] = p1 Cb as words
+ movdqa [esp+0B0h],xmm6 ; [esp+0B0h] = p1 Cr as words
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1 ; xmm2 = p0 Cb as words
+ punpcklbw xmm3,xmm1 ; xmm3 = q0 Cb as words
+ punpcklbw xmm5,xmm1 ; xmm5 = q1 Cb as words
+ movdqa [esp+0F0h],xmm7 ; [esp+0F0h] = p0 Cr as words
+ movdqa [esp+0C0h],xmm6 ; [esp+0C0h] = q0 Cr as words
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6 ; [esp+0E0h] = q1 Cr as words
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6 ; [esp+30h] = rounding constant 4 in all 8 words
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5 ; p1 - q1 (Cb)
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6 ; [esp+60h] = mask of lanes with tc > 0
+ movdqa xmm1, [esp+0D0h] ; xmm1 = -tc
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2 ; (q0 - p0) << 2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3 ; delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ pmaxsw xmm1,xmm6 ; max(-tc, delta)
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1 ; min(tc, max(-tc, delta))
+ movdqa [esp+20h],xmm6 ; [esp+20h] = clipped Cb delta
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1 ; |p0 - q0| (Cb); note pabsw is an SSSE3 instruction despite the _sse2 suffix of this routine
+ pcmpgtw xmm6,xmm1 ; alpha > |p0 - q0|
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1 ; |p1 - p0| (Cb)
+ pcmpgtw xmm7,xmm1 ; beta > |p1 - p0|
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5 ; |q1 - q0| (Cb)
+ pcmpgtw xmm1,xmm5 ; beta > |q1 - q0|
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h] ; p1 - q1 (Cr)
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h] ; also require tc > 0
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1 ; [esp+40h] = masked Cb delta (zero where the edge is left untouched)
+ movdqa xmm1, [esp+0F0h] ; xmm1 = p0 Cr
+ psubw xmm6,xmm1
+ psllw xmm6,2 ; (q0 - p0) << 2 (Cr)
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5 ; xmm0 = clipped Cr delta
+ movdqa xmm5,[esp+0C0h] ; xmm5 = q0 Cr
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |p0 - q0| (Cr)
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6 ; |p1 - p0| (Cr)
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |q1 - q0| (Cr)
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4 ; xmm0 = masked Cr delta
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4 ; p0 + delta (Cb)
+ paddw xmm1,xmm0 ; p0 + delta (Cr)
+ psubw xmm3,xmm4 ; q0 - delta (Cb)
+ psubw xmm5,xmm0 ; q0 - delta (Cr)
+ packuswb xmm2,xmm1 ; saturate to bytes: Cb low half, Cr high half
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2 ; overwrite the p0 column in the transpose buffer
+ movdqa [esp+90h],xmm3 ; overwrite the q0 column in the transpose buffer
+ mov esi,dword [esp+1Ch] ; esi = transpose buffer
+ movdqa xmm0, [esi] ; reload the four columns (p1 and q1 unchanged, p0 and q0 updated)
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0 ; same interleave sequence as above, now turning columns back into rows
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h] ; esi = pPixCb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+0Ch] ; edx = 3*iStride
+ mov edi,dword [esp+8] ; edi = pPixCr-2
+ movd dword [esi],xmm0 ; write 4 bytes back to each of Cb rows 0..3
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4 ; shift the next 4-byte group into the low dword
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h] ; esi = Cb rows 4..7 base
+ movd dword [edi],xmm0 ; Cr rows 0..3
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0 ; Cb rows 4..7
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h] ; edi = Cr rows 4..7 base
+ movd dword [edi],xmm0 ; Cr rows 4..7
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp ; drop the aligned frame
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+
+
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+ALIGN 16
+
+DeblockLumaLt4V_sse2:
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+ movdqa [esp+424-384], xmm0
+ push esi
+
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
+
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx
+
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
+
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
+
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
+
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
+
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
+
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
+
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+ mov ecx, dword [esp+432-408]
+
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
+; Notation used in the comments below: s = iStride; row(k) = the 16 bytes at pPix + k*s; lo(k)/hi(k) = bytes 0-7 / 8-15 of row(k) zero-extended to 16-bit words.
+WELS_EXTERN DeblockLumaEq4V_sse2
+; Reads row(-4)..row(3) and overwrites row(-3)..row(2); every row access uses movdqa, so those addresses must be 16-byte aligned. iAlpha/iBeta are read as 16-bit words.
+ALIGN 16
+; NOTE(review): presumably the H.264 bS == 4 luma filter across a horizontal edge (inferred from the name only) -- confirm against the C reference.
+DeblockLumaEq4V_sse2:
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction, despite the _sse2 suffix -- confirm the caller's CPU-feature gate.
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H: align esp down to 16 bytes
+ sub esp, 628 ; 00000274H: 628 + the three 4-byte pushes below = 640, so esp stays 16-byte aligned
+ mov eax, dword [ebp+8] ; eax = pPix
+ mov ecx, dword [ebp+12] ; ecx = iStride (s)
+ push ebx
+ push esi
+
+ lea edx, [ecx*4] ; edx = 4*s
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0 ; xmm2 = 0, the zero operand used when widening bytes to words
+
+ movdqa xmm0, [ecx+eax] ; xmm0 = row(1)
+ mov esi, eax
+ sub esi, edx ; esi = pPix - 4*s
+ movdqa xmm3, [esi] ; xmm3 = row(-4)
+ movdqa xmm5, [eax] ; xmm5 = row(0)
+ push edi ; third push; all esp-relative offsets below are taken after it
+ lea edi, [ecx+ecx] ; edi = 2*s
+ lea ebx, [ecx+ecx*2] ; ebx = 3*s
+ mov dword [esp+640-600], edi ; spill 2*s
+ mov esi, eax
+ sub esi, edi ; esi = pPix - 2*s (kept for the final store)
+ movdqa xmm1, [esi] ; xmm1 = row(-2)
+ movdqa [esp+720-272], xmm0 ; save row(1)
+ mov edi, eax
+ sub edi, ecx ; edi = pPix - s (kept for the final store)
+ movdqa xmm4, [edi] ; xmm4 = row(-1)
+ add ecx, eax
+ mov dword [esp+640-596], ecx ; spill pPix + s
+
+ mov ecx, dword [esp+640-600] ; ecx = 2*s
+ movdqa xmm0, [ecx+eax] ; xmm0 = row(2)
+ movdqa [esp+736-272], xmm0 ; save row(2)
+
+ movdqa xmm0, [eax+ebx] ; xmm0 = row(3)
+ mov edx, eax
+ sub edx, ebx ; edx = pPix - 3*s (kept for the final store)
+
+ movsx ebx, word [ebp+16] ; ebx = low 16 bits of iAlpha, sign-extended
+ movdqa xmm6, [edx] ; xmm6 = row(-3)
+ add ecx, eax ; ecx = pPix + 2*s (kept for the final store)
+ movdqa [esp+752-272], xmm0 ; save row(3)
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20] ; ebx = low 16 bits of iBeta, sign-extended
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0 ; xmm0 = alpha in all 8 words
+ movdqa [esp+640-320], xmm0 ; save alpha words
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0 ; xmm0 = beta in all 8 words
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7 ; save lo(2)
+ movdqa [esp+640-512], xmm0 ; save beta words
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1 ; save row(-2)
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5 ; save row(0)
+ punpcklbw xmm5, xmm2 ; xmm5 = lo(0)
+ punpcklbw xmm1, xmm2 ; xmm1 = lo(-1)
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7 ; |lo(0) - lo(-1)|
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2 ; xmm0 = lo(-2)
+ movdqa [esp+688-272], xmm4 ; save row(-1)
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0 ; save lo(-2)
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7 ; |lo(-1) - lo(-2)|
+ punpcklbw xmm4, xmm2 ; xmm4 = lo(1)
+ pcmpgtw xmm0, xmm7 ; xmm0 = beta > |lo(-1) - lo(-2)|
+ movdqa [esp+640-384], xmm4 ; save lo(1)
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6 ; save row(-3)
+ punpcklbw xmm6, xmm2 ; xmm6 = lo(-3)
+ pabsw xmm7, xmm7 ; |lo(0) - lo(1)|
+ movdqa [esp+640-48], xmm2 ; save the zero vector (xmm2 is reused later)
+ movdqa [esp+640-368], xmm6 ; save lo(-3)
+ movdqa [esp+640-144], xmm1 ; save lo(-1)
+ movdqa [esp+640-400], xmm5 ; save lo(0)
+ pcmpgtw xmm4, xmm7 ; beta > |lo(0) - lo(1)|
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560] ; alpha > |lo(0) - lo(-1)|
+ pand xmm0, xmm4 ; xmm0 = low-half filter-enable mask; it stays in xmm0 until the last low-half select
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0 ; xmm7 = 2 in all 8 words
+ paddw xmm4, xmm7 ; xmm4 = (alpha >> 2) + 2
+ movdqa [esp+640-576], xmm4 ; saved for reuse by the high half
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4 ; slot now holds (alpha >> 2) + 2 > |lo(0) - lo(-1)|
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7 ; save the words of 2 (rounding term of the >>2 sums)
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7 ; |lo(-1) - lo(-3)|
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4 ; slot -544 = low-half mask selecting the wider results for row(-3)..row(-1)
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7 ; |lo(0) - lo(2)|
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4 ; slot -560 = low-half mask selecting the wider results for row(0)..row(2)
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4 ; lo(-3) kept where the -544 mask is clear
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2 ; xmm4 = lo(-4)
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0 ; xmm7 = 4 in all 8 words
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7 ; save the words of 4 (rounding term of the >>3 sums)
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7 ; xmm4 = 2*lo(-4) + 3*lo(-3) + lo(-2) + lo(-1) + lo(0) + 4
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6 ; lo(2) kept where the -560 mask is clear
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2 ; xmm6 = lo(3)
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592] ; xmm6 = 2*lo(3) + 3*lo(2) + lo(1) + lo(0) + lo(-1) + 4
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6 ; masked new low half for row(2)
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6 ; lo(-2) kept where the -544 mask is clear
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544] ; xmm4 = masked new low half for row(-3)
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624] ; xmm6 = lo(-3) + lo(-2) + lo(-1) + lo(0) + 2
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1 ; xmm5 = lo(0) + lo(-1)
+ psraw xmm6, 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7 ; masked new low half for row(-2)
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7 ; lo(1) kept where the -560 mask is clear
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624] ; xmm7 = lo(2) + lo(1) + lo(0) + lo(-1) + 2
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5 ; masked new low half for row(1)
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592] ; xmm6 = lo(1) + 4
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624] ; xmm7 = 2*lo(-2) + lo(-1) + lo(1) + 2
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7 ; the >>2 result is used for row(-1) where the -544 mask is clear
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7 ; xmm5 = lo(-3) + 2*(lo(-2) + lo(-1) + lo(0)) + lo(1) + 4
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1 ; masked >>3 result for the low half of row(-1)
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624] ; xmm6 = 2*lo(1) + lo(0) + lo(-2) + 2
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5 ; xmm1 = 2*(lo(1) + lo(0) + lo(-1)) + lo(-2) + 4
+
+ movdqa xmm5, [esp+656-272] ; reload row(-3); widened to hi(-3) below
+ pandn xmm7, xmm6 ; the >>2 result is used for row(0) where the -560 mask is clear
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1 ; masked >>3 result for the low half of row(0)
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1 ; save hi(-2)
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2 ; xmm1 = hi(-1)
+ punpckhbw xmm6, xmm2 ; xmm6 = hi(0)
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2 ; xmm5 = hi(-3)
+ movdqa [esp+640-496], xmm1 ; save hi(-1)
+ movdqa [esp+640-432], xmm6 ; save hi(0)
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7 ; save hi(1)
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7 ; save hi(2)
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7 ; |hi(0) - hi(-1)|
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16] ; xmm4 = low half of row(-3) after the -544 mask select
+ pabsw xmm6, xmm6 ; |hi(0) - hi(1)|
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7 ; |hi(-1) - hi(-2)|
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6 ; xmm1 = high-half filter-enable mask (same three tests as xmm0); kept until the end
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7 ; (alpha >> 2) + 2 > |hi(0) - hi(-1)|
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2 ; xmm3 = hi(-4)
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7 ; |hi(-1) - hi(-3)|
+ pcmpgtw xmm6, xmm7
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6 ; slot -544 now holds the high-half mask for row(-3)..row(-1)
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544] ; xmm2 no longer holds zero from here on
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7 ; |hi(0) - hi(2)|
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6 ; slot -560 now holds the high-half mask for row(0)..row(2)
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4 ; xmm6 = final low half of row(-3): filtered where xmm0 is set, lo(-3) elsewhere
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592] ; xmm3 = 2*hi(-4) + 3*hi(-3) + hi(-2) + hi(-1) + hi(0) + 4
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336] ; xmm3 = low half of row(-2) after the mask select
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2 ; xmm7 = final high half of row(-3)
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3 ; xmm2 = final low half of row(-2)
+ packuswb xmm6, xmm7 ; narrow both halves with unsigned saturation: new row(-3)
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6 ; stash new row(-3) until the stores at the end
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624] ; xmm2 = hi(-3) + hi(-2) + hi(-1) + hi(0) + 2
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4 ; xmm2 = hi(-2) + hi(-1) + hi(0)
+ por xmm3, xmm6 ; xmm3 = final high half of row(-2)
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3 ; new row(-2)
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6 ; stash new row(-2)
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352] ; low half of row(-1): >>3 result or >>2 result per the mask
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6 ; xmm3 = final low half of row(-1)
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2 ; xmm5 = hi(-3) + 2*(hi(-2) + hi(-1) + hi(0)) + hi(1) + 4
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432] ; xmm5 = hi(1) + hi(0)
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624] ; xmm4 = 2*hi(-2) + hi(-1) + hi(1) + 2
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592] ; xmm3 = hi(-2) + 4
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2 ; xmm7 = final high half of row(-1)
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7 ; new row(-1)
+ movdqa [esp+688-272], xmm2 ; stash new row(-1)
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288] ; low half of row(0): >>3 result or >>2 result per the mask
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6 ; xmm5 = hi(1) + hi(0) + hi(-1)
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2 ; xmm4 = final low half of row(0)
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5 ; xmm2 = hi(2) + 2*(hi(1) + hi(0) + hi(-1)) + hi(-2) + 4
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624] ; xmm5 = 2*hi(1) + hi(0) + hi(-2) + 2
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304] ; xmm3 = low half of row(1) after the mask select
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496] ; xmm4 = hi(0) + hi(-1)
+ por xmm7, xmm2 ; xmm7 = final high half of row(0)
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7 ; new row(0)
+ movdqa [esp+704-272], xmm2 ; stash new row(0)
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3 ; xmm2 = final low half of row(1)
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624] ; xmm3 = hi(2) + hi(1) + hi(0) + hi(-1) + 2
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4 ; xmm7 = final high half of row(1)
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7 ; xmm6 = new row(1)
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0 ; xmm4 = final low half of row(2)
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48] ; xmm0 = hi(3), widened with the saved zero vector
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592] ; xmm0 = 2*hi(3) + 3*hi(2) + hi(1) + hi(0) + hi(-1) + 4
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0 ; store new row(-3) at pPix - 3*s
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596] ; edx = pPix + s
+ movdqa [esi], xmm0 ; store new row(-2) at pPix - 2*s
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0 ; store new row(-1) at pPix - s
+ movdqa xmm0, [esp+704-272]
+
+ pop edi ; no esp-relative accesses follow, so the epilogue pops can be interleaved with the stores
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0 ; store new row(0) at pPix
+ por xmm7, xmm1 ; xmm7 = final high half of row(2)
+ pop esi
+ packuswb xmm4, xmm7 ; xmm4 = new row(2)
+ movdqa [edx], xmm6 ; store new row(1) at pPix + s
+ movdqa [ecx], xmm4 ; store new row(2) at pPix + 2*s
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;********************************************************************************
+; Loads 8 bytes from each of 16 consecutive rows starting at pPixY (row pitch iStride), passes them through SSE2_TransTwo8x8B and stores eight 16-byte vectors at pDst .. pDst+70h.
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; pDst is written with movdqa and must be 16-byte aligned. SSE2_TransTwo8x8B is defined outside this chunk -- presumably two 8x8 byte transposes; confirm.
+;********************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeH2V_sse2:
+ push ebp
+ push ebx ; two pushes precede the frame setup, so the first argument sits at [ebp+0Ch]
+ mov ebp, esp
+ and esp,0FFFFFFF0h ; align esp down to 16 bytes
+ sub esp, 10h ; one aligned 16-byte scratch slot at [esp]
+
+ mov eax, [ebp + 0Ch] ; eax = pPixY
+ mov ecx, [ebp + 10h] ; ecx = iStride
+ lea edx, [eax + ecx * 8] ; edx = pPixY + 8*iStride (source row 8)
+ lea ebx, [ecx*3] ; ebx = 3*iStride
+
+ movq xmm0, [eax] ; low qword = 8 bytes of row 0
+ movq xmm7, [edx]
+ punpcklqdq xmm0, xmm7 ; high qword = 8 bytes of row 8
+ movq xmm1, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm1, xmm7 ; xmm1 = rows 1 | 9
+ movq xmm2, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm2, xmm7 ; xmm2 = rows 2 | 10
+ movq xmm3, [eax + ebx]
+ movq xmm7, [edx + ebx]
+ punpcklqdq xmm3, xmm7 ; xmm3 = rows 3 | 11
+
+ lea eax, [eax + ecx * 4] ; eax -> row 4
+ lea edx, [edx + ecx * 4] ; edx -> row 12
+ movq xmm4, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm4, xmm7 ; xmm4 = rows 4 | 12
+ movq xmm5, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm5, xmm7 ; xmm5 = rows 5 | 13
+ movq xmm6, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm6, xmm7 ; xmm6 = rows 6 | 14
+
+ movdqa [esp], xmm0 ; spill rows 0 | 8: xmm0 is borrowed as the temporary for the row 15 load
+ movq xmm7, [eax + ebx]
+ movq xmm0, [edx + ebx]
+ punpcklqdq xmm7, xmm0 ; xmm7 = rows 7 | 15
+ movdqa xmm0, [esp] ; restore rows 0 | 8
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ mov eax, [ebp + 14h] ; eax = pDst
+ movdqa [eax], xmm4 ; stores follow the register order given by the pOut comment above
+ movdqa [eax + 10h], xmm2
+ movdqa [eax + 20h], xmm3
+ movdqa [eax + 30h], xmm7
+ movdqa [eax + 40h], xmm5
+ movdqa [eax + 50h], xmm1
+ movdqa [eax + 60h], xmm6
+ movdqa [eax + 70h], xmm0
+
+ mov esp, ebp ; drop the aligned scratch area
+ pop ebx
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************************
+; Loads eight 16-byte vectors from pSrc .. pSrc+70h (movdqa, so pSrc must be 16-byte aligned), passes them through SSE2_TransTwo8x8B and writes 8 bytes to each of 16 rows starting at pPixY.
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+; Row pitch is iStride. SSE2_TransTwo8x8B is defined outside this chunk -- presumably two 8x8 byte transposes; confirm.
+;*******************************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeV2H_sse2:
+ push ebp
+ mov ebp, esp
+
+ and esp, 0FFFFFFF0h ; align esp down to 16 bytes
+ sub esp, 10h ; one aligned 16-byte scratch slot at [esp], handed to the macro
+
+ mov eax, [ebp + 10h] ; eax = pSrc
+ mov ecx, [ebp + 0Ch] ; ecx = iStride
+ mov edx, [ebp + 08h] ; edx = pPixY
+
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 10h]
+ movdqa xmm2, [eax + 20h]
+ movdqa xmm3, [eax + 30h]
+ movdqa xmm4, [eax + 40h]
+ movdqa xmm5, [eax + 50h]
+ movdqa xmm6, [eax + 60h]
+ movdqa xmm7, [eax + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ lea eax, [ecx * 3] ; eax = 3*iStride (pSrc is no longer needed)
+
+ movq [edx], xmm4 ; rows 0-3: low qwords, in the register order given by the pOut comment above
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4] ; rows 4-7
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+ psrldq xmm4, 8 ; shift each register right by 8 bytes so its high qword becomes the low qword
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea edx, [edx + ecx*4] ; rows 8-11
+ movq [edx], xmm4
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4] ; rows 12-15
+ movq [edx], xmm5
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+
+ mov esp, ebp ; drop the aligned scratch area
+ pop ebp
ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -155,11 +155,11 @@
lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
; ebx [width/16(8)]
; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16) ; top
; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16) ; bottom
-
+
%if %1 == 32 ; for luma
sar ebx, 04h ; width / 16(8) pixels
.top_bottom_loops:
@@ -173,7 +173,7 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
@@ -184,15 +184,15 @@
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
- jnz near .top_bottom_loops
+ jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov edx, ebx
sar ebx, 04h ; (width / 16) pixels
@@ -202,21 +202,21 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+ mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+ mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
jnz near .top_bottom_loops
@@ -243,13 +243,13 @@
%endif
%endmacro
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; ecx [height]
; esi [pSrc+0], edi [pSrc-32], edx [stride], 32(16) ; left
; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16) ; right
; xor eax, eax ; for pixel pData (uint8_t) ; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32 ; for luma
+
+%if %1 == 32 ; for luma
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
@@ -256,37 +256,37 @@
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [edi], xmm0
movdqa [edi+16], xmm0
-
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [ebp], xmm1
movdqa [ebp+16], xmm1
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
-
+ movdqa [edi], xmm0
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdq%2 [ebp], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
jnz near .left_right_loops
%endif
@@ -339,25 +339,25 @@
; TL
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
@@ -375,7 +375,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -387,10 +387,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -398,16 +398,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 32
-
+ exp_top_bottom_sse2 32
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -419,7 +419,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -426,7 +426,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 32, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -436,7 +436,7 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
mov eax, -32 ; luma=-32, chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
@@ -444,19 +444,19 @@
mov ecx, [esp+28] ; kiStride
imul edx, ecx ; (height+32(16)) * stride
lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
+ lea ebx, [ebp+edx] ; last line of bottom-right border
neg ecx ; -kiStride
; for left & right border expanding
- exp_cross_sse2 32, a
-
+ exp_cross_sse2 32, a
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -472,7 +472,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -484,10 +484,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -495,16 +495,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; pDst[w-1][h-1]
+ lea ebx, [eax+ebx] ; pDst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst: left border pSrc
@@ -516,7 +516,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -523,7 +523,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -533,9 +533,9 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
mov eax, -16 ; chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; kiStride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -545,15 +545,15 @@
neg ecx ; -kiStride
; for left & right border expanding
exp_cross_sse2 16, a
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -569,7 +569,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -581,10 +581,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -592,16 +592,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -613,7 +613,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -620,7 +620,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, u
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -630,9 +630,9 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
neg ecx ; -kiStride
mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; kiStride
add edx, 16 ; kiHeight+16, luma=32, chroma=16
@@ -642,14 +642,14 @@
neg ecx ; -kiStride
; for left & right border expanding
exp_cross_sse2 16, u
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -38,7 +38,7 @@
;* 18/09/2009 Created
;* 19/11/2010 Added
;* WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
-;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
+;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
;* and WelsIChromaPredDcNA_mmx
;*
;*
@@ -96,13 +96,13 @@
punpcklbw %1, %3
movdqa %3, %1
punpcklbw %1, %3
-
+
;add %4, %5
movd %2, [%4+%5-1]
movdqa %3, %2
punpcklbw %2, %3
movdqa %3, %2
- punpcklbw %2, %3
+ punpcklbw %2, %3
punpckldq %1, %2
%endmacro
@@ -116,24 +116,24 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpcklwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %4, [%5]
movd %2, [%5+%6]
punpcklbw %4, %2
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
lea %5, [%5+2*%6]
punpcklbw %3, %2
punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
+ punpckhdq %1, %4
+%endmacro
%macro SUMW_HORIZON 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
@@ -162,7 +162,7 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
@@ -186,7 +186,7 @@
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;
+;
; pPred must align to 16
;*******************************************************************************
WelsI4x4LumaPredH_sse2:
@@ -196,7 +196,7 @@
movzx edx, byte [eax-1]
movd xmm0, edx
pmuludq xmm0, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
movd xmm1, edx
pmuludq xmm1, [mmx_01bytes]
@@ -205,11 +205,11 @@
movzx edx, byte [eax+ecx-1]
movd xmm2, edx
pmuludq xmm2, [mmx_01bytes]
-
+
movzx edx, byte [eax+2*ecx-1]
- movd xmm3, edx
+ movd xmm3, edx
pmuludq xmm3, [mmx_01bytes]
-
+
sub eax, ecx
movd [eax], xmm0
movd [eax+ecx], xmm1
@@ -216,9 +216,9 @@
lea eax, [eax+2*ecx]
movd [eax], xmm2
movd [eax+ecx], xmm3
-
+
ret
-
+
;*******************************************************************************
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -229,9 +229,9 @@
mov ecx, [esp + pushsize + 8]
sub esi, 1
sub esi, ecx
-
+
;for H
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
movq xmm0, [esi]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
@@ -241,7 +241,7 @@
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
-
+
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
movsx eax, ax
@@ -249,26 +249,26 @@
add eax, 32
sar eax, 6 ; b = (5 * H + 32) >> 6;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx edx, BYTE [esi+16]
+
+ movzx edx, BYTE [esi+16]
sub esi, 3
LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
-
+
add esi, 3
movzx eax, BYTE [esi+8*ecx]
add edx, eax
shl edx, 4 ; a = (left[15*kiStride] + top[15]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
- pxor xmm4, xmm4
+ pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
punpckhbw xmm7, xmm4
pmullw xmm7, xmm6
psubw xmm7, xmm0
-
+
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
@@ -276,17 +276,17 @@
imul eax, 5
add eax, 32
sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_inc_minus]
-
+
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -295,7 +295,7 @@
movdqa xmm3, xmm1
pmullw xmm3, xmm6
paddw xmm3, xmm0
- psraw xmm3, 5
+ psraw xmm3, 5
packuswb xmm2, xmm3
movdqa [esi], xmm2
paddw xmm0, xmm4
@@ -302,13 +302,13 @@
add esi, ecx
inc eax
cmp eax, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
-
+ jnz get_i16x16_luma_pred_plane_sse2_1
+
pop esi
ret
-
-
-
+
+
+
;*******************************************************************************
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -315,7 +315,7 @@
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
lea eax, [eax+ecx*2]
-
+
COPY_16_TIMES eax, xmm0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx
@@ -326,13 +326,12 @@
WelsI16x16LumaPredH_sse2:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
-
+
COPY_16_TIMES eax, xmm0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [eax+ecx], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE_DEC
+
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
@@ -339,9 +338,10 @@
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
-
+ SSE2_PRED_H_16X16_TWO_LINE_DEC
+
ret
-
+
;*******************************************************************************
; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -349,10 +349,10 @@
WelsI16x16LumaPredV_sse2:
mov edx, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
-
+
sub edx, ecx
movdqa xmm0, [edx]
-
+
movdqa [edx+ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx], xmm0
@@ -377,9 +377,9 @@
movdqa [edx+ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx], xmm0
-
+
ret
-
+
;*******************************************************************************
; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -391,8 +391,8 @@
mov ecx, [esp + pushsize + 8] ;kiStride
sub esi, 1
sub esi, ecx
-
- pxor mm7, mm7
+
+ pxor mm7, mm7
movq mm0, [esi]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
@@ -402,7 +402,7 @@
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
-
+
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
@@ -412,7 +412,7 @@
add eax, 16
sar eax, 5 ; b = (17 * H + 16) >> 5;
SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
-
+
movzx edx, BYTE [esi+8]
sub esi, 3
LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
@@ -421,17 +421,17 @@
movzx eax, BYTE [esi+4*ecx]
add edx, eax
shl edx, 4 ; a = (left[7*kiStride] + top[7]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
- pxor mm4, mm4
+ pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
punpckhbw mm7, mm4
pmullw mm7, mm6
psubw mm7, mm0
-
+
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
@@ -441,17 +441,17 @@
imul eax, 17
add eax, 16
sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_mul_b_c]
-
+
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -463,12 +463,12 @@
add esi, ecx
inc eax
cmp eax, 8
- jnz get_i_chroma_pred_plane_sse2_1
-
+ jnz get_i_chroma_pred_plane_sse2_1
+
pop esi
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
@@ -480,13 +480,13 @@
; pPred[7] = ([6]+[0]*2+[1]+2)/4
;
; void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;
+;
;*******************************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WelsI4x4LumaPredDDR_mmx:
mov edx,[esp+4] ;pPred
mov eax,edx
mov ecx,[esp+8] ;kiStride
-
+
movq mm1,[eax+ecx-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
sub eax, ecx ;mov eax to above line of current block(postion of 1)
@@ -513,19 +513,19 @@
pand mm1,[mmx_01bytes] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
+
lea edx,[edx+ecx]
- movd [edx+2*ecx],mm2
+ movd [edx+2*ecx],mm2
sub edx,ecx
- psrlq mm2,8
- movd [edx+2*ecx],mm2
- psrlq mm2,8
- movd [edx+ecx],mm2
- psrlq mm2,8
+ psrlq mm2,8
+ movd [edx+2*ecx],mm2
+ psrlq mm2,8
+ movd [edx+ecx],mm2
+ psrlq mm2,8
movd [edx],mm2
WELSEMMS
ret
-
+
ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
@@ -537,36 +537,36 @@
; pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;
+;
;*******************************************************************************
-WelsI4x4LumaPredDc_sse2:
+WelsI4x4LumaPredDc_sse2:
mov eax,[esp+4] ;pPred
mov ecx,[esp+8] ;kiStride
push ebx
-
+
movzx edx, byte [eax-1h]
-
+
sub eax, ecx
movd xmm0, [eax]
pxor xmm1, xmm1
psadbw xmm0, xmm1
-
+
movd ebx, xmm0
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2-1h]
add ebx, edx
-
+
lea eax, [eax+ecx*2-1]
movzx edx, byte [eax+ecx]
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2]
add ebx, edx
add ebx, 4
sar ebx, 3
imul ebx, 0x01010101
-
+
mov edx, [esp+8] ;pPred
mov [edx], ebx
mov [edx+ecx], ebx
@@ -575,8 +575,8 @@
mov [edx+ecx], ebx
pop ebx
- ret
-
+ ret
+
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -585,7 +585,7 @@
%macro MMX_PRED_H_8X8_ONE_LINE 4
movq %1, [%3-8]
psrlq %1, 38h
-
+
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
movq [%4], %1
@@ -594,7 +594,7 @@
%macro MMX_PRED_H_8X8_ONE_LINEE 4
movq %1, [%3+ecx-8]
psrlq %1, 38h
-
+
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
movq [%4], %1
@@ -605,37 +605,37 @@
mov edx, [esp+4] ;pPred
mov eax, edx
mov ecx, [esp+8] ;kiStride
-
+
movq mm0, [eax-8]
psrlq mm0, 38h
-
+
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
movq [edx], mm0
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
-
+
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
-
+
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;*******************************************************************************
; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
@@ -645,7 +645,7 @@
get_i4x4_luma_pred_v_asm:
mov eax, [esp+4] ;pPred
mov ecx, [esp+8] ;kiStride
-
+
sub eax, ecx
mov edx, [eax]
mov [eax+ecx], edx
@@ -653,9 +653,9 @@
lea eax, [eax+2*ecx]
mov [eax+ecx], edx
mov [eax+2*ecx], edx
-
- ret
+ ret
+
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -665,7 +665,7 @@
WelsIChromaPredV_mmx:
mov eax, [esp+4] ;pPred
mov ecx, [esp+8] ;kiStride
-
+
sub eax, ecx
movq mm0, [eax]
@@ -680,11 +680,11 @@
lea eax, [eax+2*ecx]
movq [eax+ecx], mm0
movq [eax+2*ecx], mm0
-
+
WELSEMMS
ret
-
-
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -710,13 +710,13 @@
; f = (2 + l1 + (l0<<1) + lt)>>2
; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
; [b a f e h g j i] + [d c b a] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
+WelsI4x4LumaPredHD_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -723,16 +723,16 @@
sub eax, ecx
movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
+ movd mm2, [eax+2*ecx-4]
punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
+
movq mm1, mm0
psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
movq mm2, mm0
@@ -740,17 +740,17 @@
movq mm3, mm2
movq mm4, mm1
pavgb mm1, mm0
-
+
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
+
movq mm4, mm0
pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
+
psrlq mm2, 20h
psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
movq mm4, mm3
@@ -757,7 +757,7 @@
psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
+
movd [edx], mm2
lea edx, [edx+ecx]
movd [edx+2*ecx], mm3
@@ -768,9 +768,9 @@
movd [edx+ecx], mm3
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -793,17 +793,17 @@
; b = (2 + l0 + (l1<<1) + l2)>>2
; d = (2 + l1 + (l2<<1) + l3)>>2
; f = (2 + l2 + (l3<<1) + l3)>>2
-
+
; [g g f e d c b a] + [g g g g] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
+WelsI4x4LumaPredHU_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
-
+
movd mm0, [eax-4] ; mm0[3] = l0
punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
lea eax, [eax+2*ecx]
@@ -811,39 +811,39 @@
movd mm4, [eax+ecx-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
+
psrlq mm4, 18h
psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
psrlq mm0, 8h
pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
movq mm5, mm2
pavgb mm2, mm0
-
+
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
pand mm5, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
+
psrlq mm2, 8h
pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
+
punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
+
psrlq mm4, 20h
lea edx, [edx+ecx]
movd [edx+2*ecx], mm4
-
+
sub edx, ecx
movd [edx], mm1
psrlq mm1, 10h
@@ -852,9 +852,9 @@
movd [edx+2*ecx], mm1
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -880,12 +880,12 @@
; h = (2 + t1 + (t2<<1) + t3)>>2
; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
; void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
+WelsI4x4LumaPredVR_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -892,51 +892,51 @@
sub eax, ecx
movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movq mm2, [eax+ecx-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
movq mm3, mm2
pavgb mm2, mm0
-
+
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
-
+
movq mm3, mm0
psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
movq mm2, mm3
-
+
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
movd [edx], mm1
-
+
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
movd [edx+ecx], mm2
-
+
movq mm4, mm3
psllq mm4, 20h
psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
+
movq mm5, mm3
psllq mm5, 28h
psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
+
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
movd [edx+2*ecx], mm4
-
+
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
lea edx, [edx+2*ecx]
@@ -943,7 +943,7 @@
movd [edx+ecx], mm5
WELSEMMS
ret
-
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -966,13 +966,13 @@
; e = (2 + t4 + t6 + (t5<<1))>>2
; f = (2 + t5 + t7 + (t6<<1))>>2
; g = (2 + t6 + t7 + (t7<<1))>>2
-
+
; [g f e d c b a] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
+WelsI4x4LumaPredDDL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -980,11 +980,11 @@
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
movq mm3, mm0
psrlq mm3, 38h
psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
+
psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
psrlq mm2, 8h
pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -994,9 +994,9 @@
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
-
+
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
+
psrlq mm0, 8h
movd [edx], mm0
psrlq mm0, 8h
@@ -1008,8 +1008,8 @@
movd [edx+ecx], mm0
WELSEMMS
ret
-
-
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1035,40 +1035,40 @@
; g = (2 + t2 + (t3<<1) + t4)>>2
; h = (2 + t3 + (t4<<1) + t5)>>2
; j = (2 + t4 + (t5<<1) + t6)>>2
-
+
; [i d c b a] + [j h g f e] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
+WelsI4x4LumaPredVL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
-
+
sub eax, ecx
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
movq mm3, mm1
pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
+
movq mm4, mm2
- pavgb mm2, mm0
+ pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
+
movd [edx], mm3
psrlq mm3, 8h
movd [edx+2*ecx], mm3
-
+
movd [edx+ecx], mm2
psrlq mm2, 8h
lea edx, [edx+2*ecx]
@@ -1075,7 +1075,7 @@
movd [edx+ecx], mm2
WELSEMMS
ret
-
+
ALIGN 16
;*******************************************************************************
;
@@ -1082,11 +1082,11 @@
; void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
+WelsIChromaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
-
+
sub eax, ecx
movq mm0, [eax]
@@ -1100,7 +1100,7 @@
movzx edx, byte [eax-0x01] ; l4
add ebx, edx
movd mm1, ebx ; mm1 = l1+l2+l3+l4
-
+
movzx ebx, byte [eax+ecx-0x01] ; l5
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l6
@@ -1111,7 +1111,7 @@
movzx edx, byte [eax-0x01] ; l8
add ebx, edx
movd mm2, ebx ; mm2 = l5+l6+l7+l8
-
+
movq mm3, mm0
psrlq mm0, 0x20
psllq mm3, 0x20
@@ -1118,46 +1118,46 @@
psrlq mm3, 0x20
pxor mm4, mm4
psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
paddq mm3, mm1
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
+
movq mm4, [mmx_0x02]
-
+
paddq mm0, mm4
psrlq mm0, 0x02
-
+
paddq mm2, mm4
psrlq mm2, 0x02
-
+
paddq mm3, mm4
paddq mm3, mm4
psrlq mm3, 0x03
-
+
paddq mm1, mm4
paddq mm1, mm4
psrlq mm1, 0x03
-
+
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-
+
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
-
+
mov edx, [esp+8] ; pPred
-
+
movq [edx], mm0
movq [edx+ecx], mm0
movq [edx+2*ecx], mm0
lea edx, [edx+2*ecx]
movq [edx+ecx], mm0
-
+
movq [edx+2*ecx], mm1
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
@@ -1164,13 +1164,13 @@
movq [edx+2*ecx], mm1
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
-
+
pop ebx
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;*******************************************************************************
;
@@ -1177,11 +1177,11 @@
; void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
+WelsI16x16LumaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
-
+
sub eax, ecx
movdqa xmm0, [eax] ; read one row
pxor xmm1, xmm1
@@ -1191,7 +1191,7 @@
pslldq xmm0, 0x08
psrldq xmm0, 0x08
paddw xmm0, xmm1
-
+
movzx ebx, byte [eax+ecx-0x01]
movzx edx, byte [eax+2*ecx-0x01]
add ebx, edx
@@ -1209,44 +1209,44 @@
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
-
+
mov edx, [esp+8] ; pPred
-
+
movdqa [edx], xmm0
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
pop ebx
ret
-
+
;*******************************************************************************
; for intra prediction as follows, 11/19/2010
;*******************************************************************************
@@ -1258,12 +1258,12 @@
WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
WelsI16x16LumaPredDcTop_sse2:
push ebx
-
+
%define PUSH_SIZE 4
-
+
mov eax, [esp+PUSH_SIZE+4] ; pPred
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
-
+
mov ecx, ebx
neg ecx
movdqa xmm0, [eax+ecx] ; pPred-kiStride, top line
@@ -1278,10 +1278,10 @@
pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
pshuflw xmm1, xmm0, 0b1h ; 10110001
- paddw xmm0, xmm1 ; sum in word unit (x8)
+ paddw xmm0, xmm1 ; sum in word unit (x8)
movd edx, xmm0
and edx, 0ffffh
-
+
add edx, 08h
sar edx, 04h
mov dh, dl
@@ -1288,35 +1288,35 @@
mov ecx, edx
shl ecx, 010h
or edx, ecx
- movd xmm1, edx
+ movd xmm1, edx
pshufd xmm0, xmm1, 00h
movdqa xmm1, xmm0
-
+
lea ecx, [2*ebx+ebx] ; 3*kiStride
-
+
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
%undef PUSH_SIZE
pop ebx
ret
@@ -1328,41 +1328,41 @@
WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
WelsI16x16LumaPredDcNA_sse2:
push ebx
-
+
%define PUSH_SIZE 4
-
+
mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ebx, [esp+PUSH_SIZE+8] ; kiStride
-
+ mov ebx, [esp+PUSH_SIZE+8] ; kiStride
+
lea ecx, [2*ebx+ebx] ; 3*kiStride
-
+
movdqa xmm0, [sse2_dc_0x80]
- movdqa xmm1, xmm0
+ movdqa xmm1, xmm0
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
%undef PUSH_SIZE
-
+
pop ebx
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1370,12 +1370,12 @@
WELS_EXTERN WelsIChromaPredDcLeft_mmx
WelsIChromaPredDcLeft_mmx:
push ebx
- push esi
+ push esi
%define PUSH_SIZE 8
mov esi, [esp+PUSH_SIZE+4] ; pPred
mov ecx, [esp+PUSH_SIZE+8] ; kiStride
mov eax, esi
- ; for left
+ ; for left
dec eax
xor ebx, ebx
xor edx, edx
@@ -1384,7 +1384,7 @@
add ebx, edx
lea eax, [eax+2*ecx]
mov dl, [eax]
- add ebx, edx
+ add ebx, edx
mov dl, [eax+ecx]
add ebx, edx
add ebx, 02h
@@ -1451,7 +1451,7 @@
movdqa xmm6, [sse2_wd_0x02]
paddw xmm0, xmm6
psraw xmm0, 02h
- packuswb xmm0, xmm7
+ packuswb xmm0, xmm7
lea ebx, [2*ecx+ecx]
movq [eax], xmm0
movq [eax+ecx], xmm0
@@ -1463,10 +1463,10 @@
movq [eax+2*ecx], xmm0
movq [eax+ebx], xmm0
%undef PUSH_SIZE
- pop ebx
+ pop ebx
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1495,4 +1495,4 @@
ret
-
+
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -37,7 +37,7 @@
;* History
;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
@@ -84,7 +84,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq4_mmx:
-
+
push esi
push edi
push ebp
@@ -102,7 +102,7 @@
movd mm0, [ebp]
pavgb mm0, [esi]
movd [edi], mm0
-
+
dec ebx
lea edi, [edi+eax]
lea esi, [esi+ecx]
@@ -115,7 +115,7 @@
pop edi
pop esi
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
@@ -124,7 +124,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq8_mmx:
-
+
push esi
push edi
push ebp
@@ -145,14 +145,14 @@
movq mm0, [esi+ecx]
pavgb mm0, [ebp+edx]
movq [edi+eax], mm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
+
sub ebx, 2
jnz .height_loop
-
+
WELSEMMS
pop ebx
pop ebp
@@ -174,8 +174,8 @@
push edi
push ebp
push ebx
-
+
mov edi, [esp+20] ; pDst
mov eax, [esp+24] ; iDstStride
mov esi, [esp+28] ; pSrcA
@@ -188,28 +188,28 @@
movdqu xmm0, [esi]
pavgb xmm0, [ebp]
movdqu [edi], xmm0
-
+
movdqu xmm0, [esi+ecx]
pavgb xmm0, [ebp+edx]
movdqu [edi+eax], xmm0
-
+
movdqu xmm0, [esi+2*ecx]
pavgb xmm0, [ebp+2*edx]
movdqu [edi+2*eax], xmm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
+
movdqu xmm0, [esi+ecx]
pavgb xmm0, [ebp+edx]
movdqu [edi+eax], xmm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
-
+
+
sub ebx, 4
jne .height_loop
@@ -232,7 +232,7 @@
push edi
push ebx
-
+
mov esi, [esp+16]
mov eax, [esp+20]
mov edi, [esp+24]
@@ -242,12 +242,12 @@
.height_loop:
mov ebx, [esi]
mov [edi], ebx
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
- WELSEMMS
+ WELSEMMS
pop ebx
pop edi
pop esi
@@ -275,12 +275,11 @@
add edi, ecx
dec edx
jnz .height_loop
-
- WELSEMMS
+
+ WELSEMMS
pop edi
pop esi
ret
-
@@ -288,6 +287,7 @@
+
ALIGN 16
;*******************************************************************************
; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
@@ -308,7 +308,7 @@
push edi
mov esi, [esp+12] ; pSrc
- mov eax, [esp+16] ; iSrcStride
+ mov eax, [esp+16] ; iSrcStride
mov edi, [esp+20] ; pDst
mov edx, [esp+24] ; iDstStride
mov ecx, [esp+28] ; iHeight
@@ -324,7 +324,7 @@
lea esi, [esi+eax*2]
lea edi, [edi+edx*2]
jnz .height_loop
-
+
pop edi
pop esi
ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd mm3, [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movd mm0, [esi]
+ movd mm1, [esi+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [ebx+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [edi], mm0
+
+ movq mm0, mm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+ WELSEMMS
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd xmm3, [eax]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movq xmm0, [esi]
+ movq xmm1, [esi+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [edi], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ push ebx
+ push esi
+ push edi
+
+ mov eax, [esp + 12 + 20]
+
+ pxor xmm7, xmm7
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ sub esi, edi
+ sub esi, edi
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [eax]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea esi, [esi+2*edi]
+
+ movdqu xmm2, [eax+edx]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [esi],xmm0
+
+ lea eax, [eax+2*edx]
+ movdqu xmm2, [eax]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [esi+edi],xmm4
+
+ sub ecx, 2
+ jnz .hloop_chroma
+ pop edi
+ pop esi
+ pop ebx
+
+ ret
+
+
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -69,16 +69,16 @@
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight)
;*******************************************************************************
McHorVer20WidthEq4_mmx:
push esi
push edi
-
+
mov esi, [esp+12]
mov eax, [esp+16]
mov edi, [esp+20]
@@ -100,7 +100,7 @@
punpcklbw mm4, mm7
movd mm5, [esi+3]
punpcklbw mm5, mm7
-
+
paddw mm2, mm3
paddw mm4, mm5
psllw mm4, 2
@@ -113,12 +113,12 @@
psraw mm0, 5
packuswb mm0, mm7
movd [edi], mm0
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
-
+
WELSEMMS
pop edi
pop esi
@@ -181,8 +181,8 @@
ALIGN 16
;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-; int16_t iSrcStride,
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+; int16_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride
; int32_t iHeight
@@ -197,11 +197,11 @@
mov edi, [esp+24] ;pDst
mov edx, [esp+28] ;iDstStride
mov ebx, [esp+32] ;iHeight
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
sub esi, eax ;;;;;;;;need more 5 lines.
sub esi, eax
-
+
.yloop_width_8:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
@@ -215,7 +215,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -225,7 +225,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
add esi, eax
add edi, edx
dec ebx
@@ -238,8 +238,8 @@
ALIGN 16
;***********************************************************************
;void_t McHorVer22VerLast_sse2(
-; uint8_t *pSrc,
-; int32_t pSrcStride,
+; uint8_t *pSrc,
+; int32_t pSrcStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -250,17 +250,17 @@
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
-
-
+
+
paddw %7, %5
paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
@@ -272,15 +272,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqa xmm0, [esi]
movdqa xmm1, [esi+eax]
@@ -290,12 +290,12 @@
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
movdqa xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -302,61 +302,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -366,9 +366,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
@@ -379,10 +379,10 @@
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
@@ -389,18 +389,18 @@
McHorVer20WidthEq8_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov eax, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
mov ecx, [esp + 28] ;iHeight
mov edx, [esp + 24] ;iDstStride
-
+
lea esi, [esi-2] ;pSrc -= 2;
-
+
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
-.y_loop:
+.y_loop:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -413,7 +413,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -424,7 +424,7 @@
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
-
+
packuswb xmm0, xmm7
movq [edi], xmm0
@@ -432,17 +432,17 @@
lea esi, [esi+eax]
dec ecx
jnz near .y_loop
-
+
pop edi
pop esi
ret
-
+
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
@@ -449,20 +449,20 @@
McHorVer20WidthEq16_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov eax, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
mov ecx, [esp + 28] ;iHeight
mov edx, [esp + 24] ;iDstStride
-
+
lea esi, [esi-2] ;pSrc -= 2;
-
+
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
-
+
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -475,7 +475,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -501,7 +501,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -514,9 +514,9 @@
psraw xmm0, 5
packuswb xmm0, xmm7
movq [edi+8], xmm0
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
+
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
dec ecx
jnz near .y_loop
pop edi
@@ -525,10 +525,10 @@
;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight )
;*******************************************************************************
ALIGN 16
@@ -535,7 +535,7 @@
McHorVer02WidthEq8_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov edx, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
@@ -546,7 +546,7 @@
sub esi, edx
WELS_Zero xmm7
-
+
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -555,8 +555,8 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .xx_exit
@@ -566,7 +566,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .xx_exit
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -47,8 +47,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
WelsPrefetchZero_mmx:
mov eax,[esp+4]
prefetchnta [eax]
- ret
+ ret
ALIGN 16
@@ -69,7 +69,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [eax], xmm0
@@ -77,12 +77,12 @@
movdqa [eax+32], xmm0
movdqa [eax+48], xmm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [eax], mm0
@@ -102,16 +102,16 @@
movq [eax+32], mm0
movq [eax+40], mm0
movq [eax+48], mm0
- movq [eax+56], mm0
+ movq [eax+56], mm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -119,17 +119,17 @@
WelsSetMemZeroSize8_mmx:
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8] ; size
- neg ecx
+ neg ecx
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [eax], mm0
add eax, 0x08
-
+
add ecx, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""windows.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -318,9 +318,9 @@
SECTION .text
-
+
;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
CavlcParamCal_sse2:
@@ -327,16 +327,16 @@
push ebx
push edi
push esi
-
+
mov eax, [esp+16] ;coffLevel
mov edi, [esp+24] ;Level
mov ebx, [esp+32] ;endIdx
cmp ebx, 3
- jne .Level16
+ jne .Level16
pxor xmm1, xmm1
movq xmm0, [eax] ; removed QWORD
- jmp .Cal_begin
-.Level16:
+ jmp .Cal_begin
+.Level16:
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
.Cal_begin:
@@ -354,7 +354,7 @@
pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff
;pinsrw xmm6, ebx, 3
-
+
mov bl, dh
lea ebx, [byte_1pos_table+8*ebx]
@@ -362,7 +362,7 @@
pextrw ecx, xmm0, 3
shr ecx, 8
mov dh, cl
-
+
.loopHighFind0:
cmp ecx, 0
je .loopHighFind0End
@@ -372,7 +372,7 @@
add esi, 8
mov esi, [eax+2*esi]
mov [edi], si
- add edi, 2
+ add edi, 2
;add ebx, 1
inc ebx
dec ecx
@@ -403,8 +403,8 @@
;and edx, 0xff
movzx edx, byte [ebx]
mov edx, [eax+2*edx]
- mov [edi], dx
- add edi, 2
+ mov [edi], dx
+ add edi, 2
;add ebx, 1
inc ebx
dec esi
@@ -436,8 +436,8 @@
psllq xmm0, xmm3
psrlq xmm0, xmm3
movdqa xmm4, xmm1
- psllq xmm1, xmm2
- psrlq xmm4, xmm3
+ psllq xmm1, xmm2
+ psrlq xmm4, xmm3
punpcklqdq xmm1, xmm4
por xmm0, xmm1
--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
WELS_EXTERN WelsEmms
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -48,26 +48,26 @@
;***********************************************************************
; Constant
-;***********************************************************************
-
+;***********************************************************************
+
align 16
-SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
+SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 18, 23, 18, 23, 23, 29, 23, 29,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29
-
+
;***********************************************************************
; MMX functions
-;***********************************************************************
+;***********************************************************************
%macro MMX_LoadDiff4P 5
movd %1, [%3]
@@ -112,7 +112,7 @@
MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6
- MMX_SumSubMul2 %1, %2, %5
+ MMX_SumSubMul2 %1, %2, %5
%endmacro
%macro MMX_IDCT 6
@@ -145,13 +145,13 @@
mov edx, [esp+24] ; i_pix2
WELS_Zero mm7
-
+
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
-
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
+
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
mov eax, [esp+ 8] ; pDct
@@ -178,15 +178,15 @@
%define i_pred esp+pushsize+16
%define pDct esp+pushsize+20
- mov eax, [pDct ]
+ mov eax, [pDct ]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
- mov edx, [p_dst ]
- mov ecx, [i_dst ]
+ mov edx, [p_dst ]
+ mov ecx, [i_dst ]
mov eax, [p_pred]
- mov ebx, [i_pred]
+ mov ebx, [i_pred]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
@@ -195,7 +195,7 @@
WELS_Zero mm7
WELS_DW32 mm6
-
+
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx], [eax]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
lea edx, [edx+2*ecx]
@@ -202,7 +202,7 @@
lea eax, [eax+2*ebx]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx], [eax]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-
+
WELSEMMS
%undef pushsize
%undef p_dst
@@ -220,17 +220,17 @@
%macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
- MOVDQ [%1+0x00], %2
- MOVDQ [%1+0x10], %4
- MOVDQ [%1+0x20], %6
- MOVDQ [%1+0x30], %3
+ MOVDQ [%1+0x00], %2
+ MOVDQ [%1+0x10], %4
+ MOVDQ [%1+0x20], %6
+ MOVDQ [%1+0x30], %3
%endmacro
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3
%endmacro
@@ -271,40 +271,40 @@
%endmacro
%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
+ movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5
- psraw %1, $6 ; (dc + 32) >> 6
-
+ psraw %1, $6 ; (dc + 32) >> 6
+
movdqa %2, %1
psrldq %2, 4
punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
movdqa %3, %1
psrldq %3, 8
punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-
+
movdqa %4, %1
psrldq %4, 12
punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-
+
punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
+ SSE2_SumSub %6, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %2, %5
+ SSE2_SumSubMul2 %6, %1, %4
%endmacro
%macro SSE2_IDCT 7
- SSE2_SumSub %7, %2, %6
- SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
+ SSE2_SumSub %7, %2, %6
+ SSE2_SumSubDiv2 %1, %3, %5, %4
+ SSE2_SumSub %2, %1, %5
SSE2_SumSub %7, %4, %5
%endmacro
@@ -316,12 +316,12 @@
WelsDctFourT4_sse2:
push ebx
push esi
- mov esi, [esp+12]
+ mov esi, [esp+12]
mov eax, [esp+16] ; pix1
mov ebx, [esp+20] ; i_pix1
mov ecx, [esp+24] ; pix2
- mov edx, [esp+28] ; i_pix2
-
+ mov edx, [esp+28] ; i_pix2
+
pxor xmm7, xmm7
;Load 4x8
@@ -331,33 +331,33 @@
lea ecx, [ecx + 2 * edx]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-
+
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
-
+
+ SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
lea eax, [eax + 2 * ebx]
lea ecx, [ecx + 2 * edx]
-
+
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax ], [ecx ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx ], [ecx+edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-
+
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
+
lea esi, [esi+64]
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
-
+ SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
pop esi
pop ebx
ret
@@ -377,21 +377,21 @@
%define pushsize 8
push ebx
push esi
-
- mov eax, [rec]
- mov ebx, [stride]
- mov ecx, [pred]
- mov edx, [pred_stride]
- mov esi, [rs]
+ mov eax, [rec]
+ mov ebx, [stride]
+ mov ecx, [pred]
+ mov edx, [pred_stride]
+ mov esi, [rs]
+
;Load 4x8
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
-
+ SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
+
WELS_Zero xmm7
WELS_DW32 xmm6
@@ -398,41 +398,41 @@
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
-
+
add esi, 64
lea eax, [eax + 2 * ebx]
lea ecx, [ecx + 2 * edx]
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
-
+ SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
-
+
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
pop esi
pop ebx
ret
-
+
%macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
-
+
;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
@@ -443,47 +443,47 @@
WelsIDctRecI16x16Dc_sse2:
push esi
push edi
-
+
mov ecx, [luma_dc]
- mov eax, [rec]
- mov edx, [stride]
- mov esi, [pred]
- mov edi, [pred_stride]
+ mov eax, [rec]
+ mov edx, [stride]
+ mov esi, [pred]
+ mov edi, [pred_stride]
pxor xmm7, xmm7
WELS_DW32 xmm6
-
+
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+ lea esi, [esi + 2 * edi]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+ lea esi, [esi + 2 * edi]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
pop edi
pop esi
ret
@@ -517,7 +517,7 @@
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
-
+
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
@@ -525,23 +525,23 @@
WelsHadamardT4Dc_sse2:
mov eax, [esp + 4] ; luma_dc
mov ecx, [esp + 8] ; pDct
-
+
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, ecx
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, ecx + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, ecx + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-
+
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
- SSE2_SumSubD xmm1, xmm3, xmm7
+ SSE2_SumSubD xmm1, xmm3, xmm7
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
-
+
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
- WELS_DD1 xmm6
+ WELS_DD1 xmm6
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
@@ -550,7 +550,7 @@
packssdw xmm2, xmm1
movdqa [eax+ 0], xmm3
movdqa [eax+16], xmm2
-
- ret
+
+ ret
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_sse2
-
-ALIGN 16
-
-DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_sse2
-
-ALIGN 16
-
-DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_sse2
-
-ALIGN 16
-
-DeblockLumaLt4V_sse2:
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
-
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408]
-
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-; Reads the eight 16-byte rows pPix-4*iStride .. pPix+3*iStride, filters 16 columns (low/high 8 as 16-bit words) and rewrites the six rows pPix-3*iStride .. pPix+2*iStride.
-DeblockLumaEq4V_sse2:
-; NOTE(review): uses pabsw (an SSSE3 instruction) despite the _sse2 suffix, and movdqa on the pixel rows, so pPix/iStride presumably must be 16-byte aligned -- confirm with callers.
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8] ; eax = pPix
- mov ecx, dword [ebp+12] ; ecx = iStride
- push ebx
- push esi
-
- lea edx, [ecx*4] ; edx = 4*iStride
- pxor xmm0, xmm0
- movdqa xmm2, xmm0 ; xmm2 = all zeros, used to widen bytes to words
-
- movdqa xmm0, [ecx+eax] ; row at pPix + iStride
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi] ; row at pPix - 4*iStride
- movdqa xmm5, [eax] ; row at pPix
- push edi ; third push: esp is now 628+12 = 640 bytes below the aligned frame top, hence the [esp+640-N] style offsets
- lea edi, [ecx+ecx] ; edi = 2*iStride
- lea ebx, [ecx+ecx*2] ; ebx = 3*iStride
- mov dword [esp+640-600], edi ; spill 2*iStride
- mov esi, eax
- sub esi, edi ; esi = pPix - 2*iStride (kept until the final stores)
- movdqa xmm1, [esi] ; row at pPix - 2*iStride
- movdqa [esp+720-272], xmm0 ; save raw row pPix + iStride
- mov edi, eax
- sub edi, ecx ; edi = pPix - iStride (kept until the final stores)
- movdqa xmm4, [edi] ; row at pPix - iStride
- add ecx, eax
- mov dword [esp+640-596], ecx ; spill address pPix + iStride
-
- mov ecx, dword [esp+640-600] ; ecx = 2*iStride
- movdqa xmm0, [ecx+eax] ; row at pPix + 2*iStride
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx] ; row at pPix + 3*iStride
- mov edx, eax
- sub edx, ebx ; edx = pPix - 3*iStride
-
- movsx ebx, word [ebp+16] ; low 16 bits of iAlpha, sign-extended
- movdqa xmm6, [edx] ; row at pPix - 3*iStride
- add ecx, eax ; ecx = pPix + 2*iStride (kept until the final stores)
- movdqa [esp+752-272], xmm0 ; save raw row pPix + 3*iStride
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20] ; low 16 bits of iBeta, sign-extended
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0 ; alpha broadcast to all 8 words
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0 ; xmm0 = beta broadcast to all 8 words
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7 ; low 8 columns of row pPix + 2*iStride as words
- movdqa [esp+640-512], xmm0 ; beta broadcast
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1 ; save raw row pPix - 2*iStride
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5 ; save raw row pPix
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7 ; |row(0) - row(-1)|, low half
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4 ; save raw row pPix - iStride
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6 ; save raw row pPix - 3*iStride
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2 ; keep a zero vector in memory for later unpacking
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4 ; xmm0 = low-half filter mask: |row(0)-row(-1)| < alpha, |row(-1)-row(-2)| < beta, |row(0)-row(+1)| < beta
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0 ; xmm7 = constant 2 in all 8 words
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4 ; (alpha >> 2) + 2
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4 ; |row(0) - row(-1)| < (alpha >> 2) + 2
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7 ; rounding constant 2
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0 ; xmm7 = constant 4 in all 8 words
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7 ; rounding constant 4
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272] ; from here on the high 8 columns are widened (punpckhbw) and processed
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6 ; xmm1 = high-half filter mask (same three alpha/beta tests as the low-half mask in xmm0)
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544] ; xmm2 stops being the zero vector here; the saved copy at [esp+640-48] is used later
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6 ; packed result for row pPix - 3*iStride
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6 ; packed result for row pPix - 2*iStride
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2 ; packed result for row pPix - iStride
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2 ; packed result for row pPix
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7 ; xmm6 = packed result for row pPix + iStride
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48] ; widen with the zero vector saved earlier
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0 ; store row pPix - 3*iStride
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596] ; edx = pPix + iStride (spilled earlier)
- movdqa [esi], xmm0 ; store row pPix - 2*iStride
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0 ; store row pPix - iStride
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0 ; store row pPix
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7 ; xmm4 = packed result for row pPix + 2*iStride
- movdqa [edx], xmm6 ; store row pPix + iStride
- movdqa [ecx], xmm4 ; store row pPix + 2*iStride
- pop ebx
- mov esp, ebp ; drop the aligned scratch frame
- pop ebp
- ret
-
-
-;********************************************************************************
-; Gathers the first 8 bytes of 16 consecutive rows starting at pPixY and writes eight 16-byte vectors to pDst (128 bytes).
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-; pDst is written with movdqa, so it must be 16-byte aligned. SSE2_TransTwo8x8B is defined outside this section; presumably it transposes two 8x8 byte blocks -- confirm.
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx ; two registers pushed before the frame is set up, so the first argument is at [ebp+0Ch]
- mov ebp, esp
- and esp,0FFFFFFF0h
- sub esp, 10h ; one 16-byte aligned scratch slot at [esp]
-
- mov eax, [ebp + 0Ch] ; eax = pPixY
- mov ecx, [ebp + 10h] ; ecx = iStride
- lea edx, [eax + ecx * 8] ; edx = pPixY + 8*iStride (rows 8..15)
- lea ebx, [ecx*3] ; ebx = 3*iStride
-
- movq xmm0, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm0, xmm7 ; xmm0 = row 0 (low 8 bytes) | row 8 (high 8 bytes)
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7 ; rows 1 | 9
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7 ; rows 2 | 10
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7 ; rows 3 | 11
-
- lea eax, [eax + ecx * 4] ; advance both row pointers by 4 rows
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7 ; rows 4 | 12
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7 ; rows 5 | 13
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7 ; rows 6 | 14
-
- movdqa [esp], xmm0 ; park xmm0 so it can serve as a temporary for the last pair
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0 ; rows 7 | 15
- movdqa xmm0, [esp] ; restore rows 0 | 8
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- mov eax, [ebp + 14h] ; eax = pDst
- movdqa [eax], xmm4 ; registers are stored in the macro's documented output order (m5, m3, m4, m8, m6, m2, m7, m1)
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-; Loads eight 16-byte vectors from pSrc, runs them through SSE2_TransTwo8x8B and writes 8 bytes to each of 16 consecutive rows starting at pPixY.
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-; pSrc is read with movdqa, so it must be 16-byte aligned. SSE2_TransTwo8x8B is defined outside this section; presumably it transposes two 8x8 byte blocks -- confirm.
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- and esp, 0FFFFFFF0h
- sub esp, 10h ; one 16-byte aligned scratch slot at [esp] for the macro
-
- mov eax, [ebp + 10h] ; eax = pSrc
- mov ecx, [ebp + 0Ch] ; ecx = iStride
- mov edx, [ebp + 08h] ; edx = pPixY
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea eax, [ecx * 3] ; eax = 3*iStride
-
- movq [edx], xmm4 ; rows 0..3: low 8 bytes of each result register, in the macro's output order
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5 ; rows 4..7
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- psrldq xmm4, 8 ; shift the high 8 bytes of every result register down into the low half
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm4 ; rows 8..11
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4]
- movq [edx], xmm5 ; rows 12..15
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp ; drop the aligned scratch slot
- pop ebp
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop (deblocking) filters for luma and chroma block edges
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Section setup. No macros or constants are defined here in this file.
+;*******************************************************************************
+; The read-only data section is declared (attributes depend on FORMAT_COFF) but nothing is placed in it before switching to .text.
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_sse2
+; Filters 8 columns of Cb and 8 columns of Cr together (Cb in the low qword, Cr in the high qword): reads rows -2..+1 around each pointer and rewrites rows -1 and 0 where the alpha/beta tests pass.
+ALIGN 16
+DeblockChromaEq4V_sse2: ; NOTE(review): uses pabsw, an SSSE3 instruction, despite the _sse2 suffix
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h ; 68h + two 4-byte pushes below = 70h, so esp stays 16-byte aligned for the movdqa spills
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+ movq xmm4,[ecx] ; Cr row 0
+ movq xmm5,[edx+ecx] ; Cr row +1
+ push esi
+ push edi
+ lea esi,[edx+edx] ; esi = 2*iStride
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi] ; Cb row -2
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi] ; Cr row -2
+ punpcklqdq xmm1,xmm2 ; xmm1 = row -2 (Cb | Cr)
+ mov esi,eax
+ sub esi,edx ; esi = pPixCb - iStride (kept for the final store)
+ movq xmm2,[esi] ; Cb row -1
+ mov edi,ecx
+ sub edi,edx ; edi = pPixCr - iStride (kept for the final store)
+ movq xmm3,[edi] ; Cr row -1
+ punpcklqdq xmm2,xmm3 ; xmm2 = row -1 (Cb | Cr)
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4 ; xmm3 = row 0 (Cb | Cr)
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h] ; iAlpha
+ punpcklqdq xmm4,xmm5 ; xmm4 = row +1 (Cb | Cr)
+ movd xmm5,edx
+ mov edx, [ebp + 18h] ; iBeta
+ pxor xmm0,xmm0 ; zero vector for widening bytes to words
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0 ; xmm5 = low 16 bits of iAlpha in all 8 words
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0 ; xmm6 = low 16 bits of iBeta in all 8 words
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1 ; row -2, Cr half, as words
+ movdqa [esp+60h],xmm7 ; row -2, Cb half, as words
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7 ; row -1, Cb half
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7 ; row 0, Cb half
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0 ; xmm2 = row -1, Cr half
+ punpcklbw xmm7,xmm0 ; xmm7 = row +1, Cb half
+ movdqa [esp+30h],xmm3 ; row 0, Cr half
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4 ; row +1, Cr half
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1 ; alpha > |row(-1) - row(0)|
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1 ; beta > |row(-2) - row(-1)|
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1 ; beta > |row(+1) - row(0)|
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4 ; xmm0 = filter mask for the Cb half
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6 ; xmm5 = filter mask for the Cr half (same three tests)
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0 ; xmm1 = rounding constant 2 in all 8 words
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1 ; the slot is reused for the constant; row -1 (Cb) is still live in xmm3
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2 ; (2*row(-2) + row(-1) + row(+1) + 2) >> 2, Cb half
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4 ; filtered value where the mask is set, original row -1 elsewhere
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2 ; same formula for the Cr half
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4 ; xmm1 = new row -1 (Cb | Cr) as bytes
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2 ; (2*row(+1) + row(0) + row(-2) + 2) >> 2, Cb half
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1 ; store new Cb row -1
+ psraw xmm6,2 ; same formula for the Cr half
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4 ; xmm2 = new row 0 (Cb | Cr) as bytes
+ movq [eax],xmm2 ; store new Cb row 0
+ psrldq xmm1,8
+ movq [edi],xmm1 ; store new Cr row -1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2 ; store new Cr row 0
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;
+; Filters 8 columns of Cb and 8 columns of Cr together (Cb in the low qword,
+; Cr in the high qword of each vector). Rows -2..+1 around each pointer are
+; read; rows -1 and 0 are rewritten.
+;
+; Per column:
+;   delta = clip(-tc, tc, (((row0 - row(-1)) << 2) + (row(-2) - row(+1)) + 4) >> 3)
+;   row(-1) += delta, row0 -= delta   (saturated to 0..255 by packuswb)
+; applied only where |row(-1) - row0| < alpha, |row(-2) - row(-1)| < beta,
+; |row(+1) - row0| < beta and tc > 0.
+;
+; pTC[0..3] are read as signed bytes; each value covers two adjacent columns,
+; and the same four values are used for both Cb and Cr.
+;
+; NOTE(review): uses pabsw, an SSSE3 instruction, despite the _sse2 suffix.
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+; 16-byte align the entry point, consistent with the other routines in this file.
+ALIGN 16
+
+DeblockChromaLt4V_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h ; 0E4h + three 4-byte pushes below = 0F0h, so esp ends up 16-byte aligned
+ push ebx
+ push esi
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2] ; tc[2]
+ push edi
+ movsx di,byte [esi+3] ; tc[3]
+ mov word [esp+0Ch],bx ; spill tc[2]
+ movsx bx,byte [esi+1] ; tc[1]
+ movsx esi,byte [esi] ; tc[0]
+ mov word [esp+0Eh],si ; spill tc[0]
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h] ; iStride
+ mov eax, [ebp + 08h] ; pPixCb
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch] ; pPixCr
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0 ; zero vector kept in memory
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx] ; esi = 2*iStride
+ mov edi,eax
+ sub edi,esi ; edi = pPixCb - 2*iStride
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h] ; xmm1 = zero
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx] ; Cr row +1
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax] ; Cb row 0
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi] ; Cb row -2
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7 ; xmm0 = tc words: tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
+ mov edi,ecx
+ sub edi,esi ; edi = pPixCr - 2*iStride
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2 ; -tc (lower clip bound)
+ movq xmm2, [edi] ; Cr row -2
+ punpcklqdq xmm6,xmm2 ; xmm6 = row -2 (Cb | Cr)
+ mov esi,eax
+ sub esi,edx ; esi = pPixCb - iStride (kept for the final store)
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx ; edi = pPixCr - iStride (kept for the final store)
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2 ; xmm7 = row -1 (Cb | Cr)
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2 ; xmm3 = row 0 (Cb | Cr)
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h] ; low 16 bits of iAlpha
+ punpcklqdq xmm2,xmm4 ; xmm2 = row +1 (Cb | Cr)
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h] ; low 16 bits of iBeta
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2 ; beta in all 8 words
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0 ; xmm4 = alpha in all 8 words
+ movdqa [esp+30h],xmm2 ; row -2, Cb half, as words
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6 ; row -2, Cr half
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6 ; row 0, Cr half
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6 ; row +1, Cr half
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1 ; xmm5 = row +1, Cb half
+ movdqa [esp+0A0h],xmm7 ; row -1, Cr half
+ punpcklbw xmm3,xmm1 ; xmm3 = row 0, Cb half
+ mov edx,4
+ punpcklbw xmm2,xmm1 ; xmm2 = row -1, Cb half
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6 ; rounding constant 4 in all 8 words
+ psubw xmm7,xmm5 ; row(-2) - row(+1)
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6 ; mask of lanes with tc > 0 (replaces the zero vector in this slot)
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2 ; (row0 - row(-1)) << 2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6 ; clip below at -tc
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1 ; clip above at tc
+ movdqa [esp+10h],xmm6 ; clipped delta, Cb half
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1 ; alpha > |row(-1) - row0|
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1 ; beta > |row(-2) - row(-1)|
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5 ; beta > |row(+1) - row0|
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h] ; also require tc > 0
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1 ; masked delta, Cb half
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5 ; xmm0 = clipped delta, Cr half
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4 ; masked delta, Cr half
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4 ; row(-1) + delta, Cb half
+ paddw xmm1,xmm0 ; row(-1) + delta, Cr half
+ packuswb xmm2,xmm1
+ movq [esi],xmm2 ; store new Cb row -1
+ psubw xmm3,xmm4 ; row0 - delta, Cb half
+ psubw xmm5,xmm0 ; row0 - delta, Cr half
+ packuswb xmm3,xmm5
+ movq [eax],xmm3 ; store new Cb row 0
+ psrldq xmm2,8
+ movq [edi],xmm2 ; store new Cr row -1
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3 ; store new Cr row 0
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+; Filters p0/q0 across a vertical edge for 8 rows of Cb and 8 rows of Cr at once (cdecl, 32-bit): gathers 4 bytes (p1 p0 q0 q1) per row starting at pPix-2, transposes them into column vectors, filters, transposes back and stores.
+WELS_EXTERN DeblockChromaEq4H_sse2
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction despite the _sse2 suffix - confirm callers only dispatch here on SSSE3-capable CPUs.
+ALIGN 16
+
+DeblockChromaEq4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h ; align the frame to 16 bytes
+ sub esp,0C8h ; 0C8h + the two 4-byte pushes below = 0D0h, so esp is 16-byte aligned again once edi is pushed
+ mov ecx,dword [ebp+8] ; ecx = pPixCb
+ mov edx,dword [ebp+0Ch] ; edx = pPixCr
+ mov eax,dword [ebp+10h] ; eax = iStride
+ sub ecx,2 ; step back 2 bytes so each 4-byte row load covers p1 p0 q0 q1
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2] ; esi = 3 * iStride
+ mov dword [esp+18h],ecx ; save Cb-2 (read back as [esp+1Ch] after push edi)
+ mov dword [esp+4],edx ; save Cr-2 (read back as [esp+8] after push edi)
+ lea ecx,[ecx+eax*4] ; ecx = Cb-2 + 4*iStride (rows 4..7)
+ lea edx,[edx+eax*4] ; edx = Cr-2 + 4*iStride (rows 4..7)
+ lea eax,[esp+7Ch] ; address of the 64-byte transpose scratch area (= esp+80h after push edi)
+ push edi
+ mov dword [esp+14h],esi ; [esp+14h] = 3 * iStride
+ mov dword [esp+18h],ecx ; [esp+18h] = Cb-2 + 4*iStride
+ mov dword [esp+0Ch],edx ; [esp+0Ch] = Cr-2 + 4*iStride
+ mov dword [esp+10h],eax ; [esp+10h] = scratch pointer
+ mov esi,dword [esp+1Ch] ; esi = Cb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+14h] ; edx = 3 * iStride
+ movd xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8] ; esi = Cr-2
+ movd xmm4,dword [esi] ; Cr rows 0..3
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4 ; xmmN low qword = [Cb row N | Cr row N]
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h] ; esi = Cb-2 + 4*iStride
+ mov edi,dword [esp+0Ch] ; edi = Cr-2 + 4*iStride
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4 ; xmm0 = [Cb r0 | Cr r0 | Cb r4 | Cr r4]
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4 ; xmm1 = rows 1 and 5
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4 ; xmm2 = rows 2 and 6
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4 ; xmm3 = rows 3 and 7
+ movdqa xmm6,xmm0 ; byte transpose: bytes -> words -> dwords -> qwords
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h] ; edi = scratch (esp+80h)
+ movdqa [edi],xmm0 ; scratch+00h: p1 column (low 8 bytes Cb rows 0..7, high 8 bytes Cr rows 0..7)
+ movdqa [edi+10h],xmm5 ; scratch+10h: p0 column
+ movdqa [edi+20h],xmm1 ; scratch+20h: q0 column
+ movdqa [edi+30h],xmm6 ; scratch+30h: q1 column
+ movsx ecx,word [ebp+14h] ; low 16 bits of iAlpha, sign-extended
+ movsx edx,word [ebp+18h] ; low 16 bits of iBeta, sign-extended
+ movdqa xmm6,[esp+80h] ; p1
+ movdqa xmm4,[esp+90h] ; p0
+ movdqa xmm5,[esp+0A0h] ; q0
+ movdqa xmm7,[esp+0B0h] ; q1
+ pxor xmm0,xmm0 ; zero, used to widen bytes to words
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0 ; xmm1 = alpha in all 8 words
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0 ; xmm2 = beta in all 8 words
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6 ; [esp+60h] = p1, high half (Cr) as words
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6 ; [esp+30h] = p0, high half
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6 ; [esp+40h] = q0, high half
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6 ; [esp+70h] = q1, high half
+ punpcklbw xmm7,xmm0 ; low halves (Cb) as words: xmm7 = q1, xmm4 = p0, xmm5 = q0, xmm3 = p1
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7 ; [esp+50h] = q1, low half
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |p0 - q0| (SSSE3 instruction)
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6 ; alpha > |p0 - q0|
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6 ; |p1 - p0|
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6 ; beta > |p1 - p0|
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6 ; |q1 - q0|
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6 ; beta > |q1 - q0|
+ movdqa xmm6,[esp+30h] ; same three tests for the high (Cr) half
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7 ; xmm0 = filter mask for the low half
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2 ; xmm1 = filter mask for the high half
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2 ; [esp+20h] = rounding constant 2 in all words
+ movdqa xmm2,xmm3 ; low half: p0' = (2*p1 + p0 + q1 + 2) >> 2
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2 ; select p0' where the mask is set, original p0 elsewhere
+ movdqa xmm2,[esp+60h] ; high half: same p0' formula
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6 ; write the filtered p0 column back into the scratch area
+ movdqa xmm6,xmm2 ; low half: q0' = (2*q1 + q0 + p1 + 2) >> 2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h] ; high half: same q0' formula
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4 ; write the filtered q0 column back into the scratch area
+ mov esi,dword [esp+10h] ; esi = scratch; reload p1, p0', q0', q1 and transpose back to row order
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4 ; xmm0 = rows 0/4, xmm5 = rows 1/5, xmm1 = rows 2/6, xmm6 = rows 3/7 (dword order Cb, Cr, Cb+4, Cr+4)
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch] ; esi = Cb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+14h] ; edx = 3 * iStride
+ mov edi,dword [esp+8] ; edi = Cr-2
+ movd dword [esi],xmm0 ; store Cb rows 0..3
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4 ; shift the next dword into the low position
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h] ; esi = Cb-2 + 4*iStride
+ movd dword [edi],xmm0 ; store Cr rows 0..3
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0 ; store Cb rows 4..7
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch] ; edi = Cr-2 + 4*iStride
+ movd dword [edi],xmm0 ; store Cr rows 4..7
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp ; discard the aligned frame
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+; Filters p0/q0 across a vertical edge for 8 rows of Cb and 8 rows of Cr (cdecl, 32-bit) with a delta clamped to [-tc, tc]; pTC supplies 4 signed bytes, each applied to two consecutive rows. Rows are gathered from pPix-2, transposed, filtered, transposed back and stored.
+WELS_EXTERN DeblockChromaLt4H_sse2
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction despite the _sse2 suffix - confirm callers only dispatch here on SSSE3-capable CPUs.
+ALIGN 16
+
+DeblockChromaLt4H_sse2:
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h ; align the frame to 16 bytes
+ sub esp,108h ; 108h + the two 4-byte pushes below = 110h, so esp is 16-byte aligned again once edi is pushed
+ mov ecx,dword [ebp+8] ; ecx = pPixCb
+ mov edx,dword [ebp+0Ch] ; edx = pPixCr
+ mov eax,dword [ebp+10h] ; eax = iStride
+ sub ecx,2 ; step back 2 bytes so each 4-byte row load covers p1 p0 q0 q1
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2] ; esi = 3 * iStride
+ mov dword [esp+10h],ecx ; save Cb-2 (read back as [esp+14h] after push edi)
+ mov dword [esp+4],edx ; save Cr-2 (read back as [esp+8] after push edi)
+ lea ecx,[ecx+eax*4] ; ecx = Cb-2 + 4*iStride
+ lea edx,[edx+eax*4] ; edx = Cr-2 + 4*iStride
+ lea eax,[esp+6Ch] ; address of the 64-byte transpose scratch area (= esp+70h after push edi)
+ push edi
+ mov dword [esp+0Ch],esi ; [esp+0Ch] = 3 * iStride
+ mov dword [esp+18h],ecx ; [esp+18h] = Cb-2 + 4*iStride
+ mov dword [esp+10h],edx ; [esp+10h] = Cr-2 + 4*iStride
+ mov dword [esp+1Ch],eax ; [esp+1Ch] = scratch pointer
+ mov esi,dword [esp+14h] ; esi = Cb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+0Ch] ; edx = 3 * iStride
+ movd xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8] ; esi = Cr-2
+ movd xmm4,dword [esi] ; Cr rows 0..3
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4 ; xmmN low qword = [Cb row N | Cr row N]
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h] ; esi = Cb-2 + 4*iStride
+ mov edi,dword [esp+10h] ; edi = Cr-2 + 4*iStride
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4 ; xmm0 = [Cb r0 | Cr r0 | Cb r4 | Cr r4]
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4 ; xmm1 = rows 1 and 5
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4 ; xmm2 = rows 2 and 6
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4 ; xmm3 = rows 3 and 7
+ movdqa xmm6,xmm0 ; byte transpose: bytes -> words -> dwords -> qwords
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch] ; edi = scratch (esp+70h)
+ movdqa [edi],xmm0 ; scratch+00h: p1 column (low 8 bytes Cb rows 0..7, high 8 bytes Cr rows 0..7)
+ movdqa [edi+10h],xmm5 ; scratch+10h: p0 column
+ movdqa [edi+20h],xmm1 ; scratch+20h: q0 column
+ movdqa [edi+30h],xmm6 ; scratch+30h: q1 column
+ mov eax,dword [ebp+1Ch] ; eax = pTC
+ movsx cx,byte [eax+3] ; cx = tc[3], dx = tc[2], si = tc[1], ax = tc[0] (sign-extended to 16 bits)
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0 ; [esp+60h] = zero vector
+ movzx edx,ax
+ movsx eax,word [ebp+14h] ; low 16 bits of iAlpha, sign-extended
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h] ; low 16 bits of iBeta, sign-extended
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h] ; xmm1 = zero
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h] ; xmm5 = q1 column
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h] ; xmm6 = p1 column
+ punpcklwd xmm0,xmm7 ; xmm0 = tc words [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3]
+ movdqa xmm7,[esp+80h] ; xmm7 = p0 column
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2 ; [esp+0D0h] = -tc
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0 ; xmm4 = alpha in all 8 words
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0 ; xmm2 = beta in all 8 words
+ movdqa xmm3, [esp+90h] ; xmm3 = q0 column
+ movdqa [esp+50h],xmm2 ; [esp+50h] = beta
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2 ; [esp+40h] = p1, low half (Cb) as words
+ movdqa [esp+0B0h],xmm6 ; [esp+0B0h] = p1, high half (Cr)
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1 ; xmm2 = p0 low, xmm3 = q0 low, xmm5 = q1 low
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7 ; [esp+0F0h] = p0, high half
+ movdqa [esp+0C0h],xmm6 ; [esp+0C0h] = q0, high half
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6 ; [esp+0E0h] = q1, high half
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6 ; [esp+30h] = rounding constant 4 in all words
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5 ; p1 - q1 (low half)
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6 ; [esp+60h] = mask of lanes with tc > 0
+ movdqa xmm1, [esp+0D0h] ; xmm1 = -tc
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3 ; delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ pmaxsw xmm1,xmm6 ; clamp below at -tc
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1 ; clamp above at tc
+ movdqa [esp+20h],xmm6 ; [esp+20h] = clamped delta (low half)
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1 ; |p0 - q0| (SSSE3 instruction)
+ pcmpgtw xmm6,xmm1 ; alpha > |p0 - q0|
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1 ; |p1 - p0|
+ pcmpgtw xmm7,xmm1 ; beta > |p1 - p0|
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5 ; |q1 - q0|
+ pcmpgtw xmm1,xmm5 ; beta > |q1 - q0|
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h] ; p1 - q1 (high half)
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h] ; also require tc > 0
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1 ; [esp+40h] = masked delta for the low half
+ movdqa xmm1, [esp+0F0h] ; high half: same delta computation, xmm1 = p0 high
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5 ; xmm0 = clamped delta (high half)
+ movdqa xmm5,[esp+0C0h] ; xmm5 = q0 high
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6 ; alpha > |p0 - q0|
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6 ; beta > |p1 - p0|
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6 ; beta > |q1 - q0|
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h] ; also require tc > 0
+ pand xmm0,xmm4 ; xmm0 = masked delta for the high half
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4 ; p0' = p0 + delta
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4 ; q0' = q0 - delta
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1 ; saturate to bytes and rejoin the low/high halves
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2 ; write the filtered p0 column back into the scratch area
+ movdqa [esp+90h],xmm3 ; write the filtered q0 column back into the scratch area
+ mov esi,dword [esp+1Ch] ; esi = scratch; reload p1, p0', q0', q1 and transpose back to row order
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4 ; xmm0 = rows 0/4, xmm5 = rows 1/5, xmm1 = rows 2/6, xmm6 = rows 3/7 (dword order Cb, Cr, Cb+4, Cr+4)
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h] ; esi = Cb-2
+ mov ecx,dword [ebp+10h] ; ecx = iStride
+ mov edx,dword [esp+0Ch] ; edx = 3 * iStride
+ mov edi,dword [esp+8] ; edi = Cr-2
+ movd dword [esi],xmm0 ; store Cb rows 0..3
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4 ; shift the next dword into the low position
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h] ; esi = Cb-2 + 4*iStride
+ movd dword [edi],xmm0 ; store Cr rows 0..3
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0 ; store Cb rows 4..7
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h] ; edi = Cr-2 + 4*iStride
+ movd dword [edi],xmm0 ; store Cr rows 4..7
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp ; discard the aligned frame
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+; Filters 16 pixels across a horizontal luma edge (cdecl, 32-bit): reads rows p2,p1,p0 above pPix and q0,q1,q2 from pPix downwards, rewrites p1,p0,q0,q1 with deltas clamped by tc; each of the 4 pTC bytes covers 4 consecutive pixels.
+; All row accesses use movdqa, so pPix and iStride must keep every touched row 16-byte aligned. Stack slots are written as [esp+432-X]; 432 = 420 + three 4-byte pushes.
+WELS_EXTERN DeblockLumaLt4V_sse2
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction despite the _sse2 suffix - confirm callers only dispatch here on SSSE3-capable CPUs.
+ALIGN 16
+
+DeblockLumaLt4V_sse2:
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8] ; eax = pPix (row q0)
+ mov ecx, dword [ebp+12] ; ecx = iStride
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24] ; edx = pTC
+ movdqa [esp+424-384], xmm0 ; zero vector (same slot as [esp+432-384] once esi and edi are pushed)
+ push esi
+
+ lea esi, [ecx+ecx*2] ; esi = 3 * iStride
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi] ; p2 = row at pPix - 3*iStride
+
+ lea esi, [ecx+ecx] ; esi = 2 * iStride
+ movdqa [esp+432-208], xmm0 ; [432-208] = p2 bytes
+ mov edi, eax
+ sub edi, esi ; edi = pPix - 2*iStride (p1 row; also the p1 store address at the end)
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0 ; [448-208] = p1 bytes
+
+ mov ebx, eax
+ sub ebx, ecx ; ebx = pPix - iStride (p0 row)
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0 ; [464-208] = p0 bytes
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax ; ecx = pPix + iStride (q1 row)
+ movdqa [esp+480-208], xmm0 ; [480-208] = q0 bytes
+ movdqa xmm0, [ecx]
+ mov dword [esp+432-404], ecx ; save the q1 row address
+
+ movsx ecx, word [ebp+16] ; low 16 bits of iAlpha, sign-extended
+ movdqa [esp+496-208], xmm0 ; [496-208] = q1 bytes
+ movdqa xmm0, [esi+eax] ; q2 = row at pPix + 2*iStride
+
+ movsx si, byte [edx] ; si = tc[0]
+ movdqa [esp+512-208], xmm0 ; [512-208] = q2 bytes
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20] ; low 16 bits of iBeta, sign-extended
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0 ; [432-112] = alpha in all 8 words
+ movd xmm0, ecx
+ movsx cx, byte [edx+1] ; cx = tc[1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ mov dword [esp+432-408], ebx ; save the p0 row address
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0 ; xmm0 = beta in all 8 words
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0 ; [432-336] = beta
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3] ; cx = tc[3]
+ movsx dx, byte [edx+2] ; dx = tc[2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7 ; xmm0 = words [tc0 x4, tc1 x4]
+ movdqa [esp+432-400], xmm0 ; [432-400] = tc for pixels 0..7
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+ movdqa xmm0, [esp+432-384] ; xmm0 = zero
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0 ; xmm2 = p1, low 8 pixels as words
+
+ mov ecx, 4
+ movsx edx, cx ; edx = 4 (rounding constant, broadcast later)
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5 ; [432-240] = q1 low
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5 ; [432-352] = q2 low
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7 ; xmm1 = words [tc2 x4, tc3 x4] (tc for pixels 8..15)
+ punpcklbw xmm6, xmm0 ; xmm6 = q0 low
+ punpcklbw xmm3, xmm0 ; xmm3 = p0 low
+ punpcklbw xmm4, xmm0 ; xmm4 = p2 low
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7 ; |p0 - p2| (SSSE3 instruction)
+ movdqa [esp+432-272], xmm4 ; [432-272] = p2 low
+ movdqa xmm4, [esp+432-336] ; xmm4 = beta (kept in xmm4 for the rest of the routine)
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5 ; [432-288] = mask beta > |p0 - p2|
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7 ; |q0 - q2|
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5 ; [432-256] = mask beta > |q0 - q2|
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5 ; [432-304] = (p0 + q0 + 1) >> 1
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288] ; masks are -1 per lane, so each subtraction adds 1 to tc where the test passed
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5 ; [432-224] = tc adjusted for the p0/q0 delta
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3 ; q0 - p0
+ movdqa [esp+432-32], xmm6 ; [432-32] = q0 low
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5 ; [432-384] = q0 - p0 (reuses the zero slot; zero stays in xmm0)
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7 ; alpha > |q0 - p0|
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6 ; beta > |q0 - q1|
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6 ; beta > |p0 - p1|
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6 ; tc >= 0
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5 ; [432-320] = overall filter mask (low half)
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5 ; [432-336] now holds the constant 4 (beta remains in xmm4)
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5 ; lower clamp bound = -(adjusted tc)
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3 ; (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6 ; clamp to [-tc', tc']
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5 ; [432-64] = masked p0/q0 delta (low half)
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5 ; [432-368] = -tc
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1 ; (p2 + avg(p0,q0) - 2*p1) >> 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6 ; clamp to [-tc, tc]
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288] ; only where beta > |p0 - p2|
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5 ; [432-96] = masked p1 delta (low half)
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1 ; (q2 + avg(p0,q0) - 2*q1) >> 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256] ; only where beta > |q0 - q2|
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0 ; from here on: the same computation for the high 8 pixels
+ movdqa [esp+432-352], xmm7 ; [432-352] = q1 high
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5 ; [432-48] = masked q1 delta (low half)
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6 ; [432-368] = p1 high
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7 ; [432-384] = q2 high
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6 ; [432-400] = p0 high
+
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5 ; [432-16] = p2 high
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0 ; xmm6 = q0 high
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5 ; [432-288] = mask beta > |p0 - p2| (high half)
+
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5 ; [432-256] = mask beta > |q0 - q2| (high half)
+
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6 ; [432-80] = q0 high
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5 ; [432-304] = (p0 + q0 + 1) >> 1 (high half)
+
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5 ; [432-224] = adjusted tc (high half)
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5 ; [432-272] = q0 - p0 (high half)
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7 ; alpha > |q0 - p0|
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6 ; beta > |q0 - q1|
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352] ; xmm6 = p1 - q1 (high half)
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7 ; beta > |p0 - p1|
+ pand xmm5, xmm4
+
+ paddw xmm2, [esp+432-96] ; low half: p1' = p1 + p1 delta
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7 ; tc >= 0
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5 ; [432-320] = overall filter mask (high half)
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1 ; xmm0 = -tc (high half); the zero register is consumed here
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0 ; [432-336] = -tc (high half)
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4 ; [432-272] = masked p0/q0 delta (high half)
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7 ; high half: p1' = p1 + p1 delta
+ packuswb xmm2, xmm5 ; xmm2 = 16 filtered p1 bytes
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5 ; p0' = p0 + delta (high)
+ paddw xmm3, xmm4 ; p0' = p0 + delta (low)
+ packuswb xmm3, xmm0 ; xmm3 = 16 filtered p0 bytes
+
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4 ; q0' = q0 - delta (low)
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5 ; q0' = q0 - delta (high)
+
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48] ; q1' = q1 + q1 delta (low)
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0 ; park the 16 filtered q0 bytes
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+ mov ecx, dword [esp+432-408] ; ecx = p0 row address
+
+ mov edx, dword [esp+432-404] ; edx = q1 row address
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+ movdqa [edi], xmm2 ; store p1 row (edi still = pPix - 2*iStride)
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256] ; same slot as [432-256]; base is 428 because edi has been popped
+ movdqa [ecx], xmm3 ; store p0 row
+ paddw xmm7, xmm1 ; q1' = q1 + q1 delta (high)
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0 ; store q0 row
+ movdqa [edx], xmm5 ; store q1 row
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
+; Filters 16 pixels across a horizontal luma edge (cdecl, 32-bit): reads rows p3..p0 above pPix and q0..q3 from pPix downwards, rewrites p2,p1,p0,q0,q1,q2. Rows are accessed with movdqa, so every touched row must be 16-byte aligned. Stack slots are written as [esp+640-X]; 640 = 628 + three 4-byte pushes.
+WELS_EXTERN DeblockLumaEq4V_sse2
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction despite the _sse2 suffix - confirm callers only dispatch here on SSSE3-capable CPUs.
+ALIGN 16
+
+DeblockLumaEq4V_sse2:
+
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8] ; eax = pPix (row q0)
+ mov ecx, dword [ebp+12] ; ecx = iStride
+ push ebx
+ push esi
+
+ lea edx, [ecx*4] ; edx = 4 * iStride
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0 ; xmm2 = zero, used to widen bytes to words
+
+ movdqa xmm0, [ecx+eax] ; q1 row
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi] ; xmm3 = p3 row (pPix - 4*iStride)
+ movdqa xmm5, [eax] ; xmm5 = q0 row
+ push edi
+ lea edi, [ecx+ecx] ; edi = 2 * iStride
+ lea ebx, [ecx+ecx*2] ; ebx = 3 * iStride
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi ; esi = pPix - 2*iStride (p1 row; kept for the final store)
+ movdqa xmm1, [esi] ; xmm1 = p1 row
+ movdqa [esp+720-272], xmm0 ; [720-272] = q1 bytes
+ mov edi, eax
+ sub edi, ecx ; edi = pPix - iStride (p0 row; kept for the final store)
+ movdqa xmm4, [edi] ; xmm4 = p0 row
+ add ecx, eax
+ mov dword [esp+640-596], ecx ; save the q1 row address (pPix + iStride)
+
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0 ; [736-272] = q2 bytes
+
+ movdqa xmm0, [eax+ebx] ; q3 row
+ mov edx, eax
+ sub edx, ebx ; edx = pPix - 3*iStride (p2 row; kept for the final store)
+
+ movsx ebx, word [ebp+16] ; low 16 bits of iAlpha, sign-extended
+ movdqa xmm6, [edx] ; xmm6 = p2 row
+ add ecx, eax ; ecx = pPix + 2*iStride (q2 row; kept for the final store)
+ movdqa [esp+752-272], xmm0 ; [752-272] = q3 bytes
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20] ; low 16 bits of iBeta, sign-extended
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0 ; [640-320] = alpha in all 8 words
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0 ; xmm0 = beta in all 8 words
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7 ; [640-416] = q2 low (first 8 pixels as words)
+ movdqa [esp+640-512], xmm0 ; [640-512] = beta
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1 ; [672-272] = p1 bytes
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5 ; [704-272] = q0 bytes
+ punpcklbw xmm5, xmm2 ; xmm5 = q0 low
+ punpcklbw xmm1, xmm2 ; xmm1 = p0 low
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7 ; |q0 - p0| (SSSE3 instruction)
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4 ; [688-272] = p0 bytes
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0 ; [640-480] = p1 low
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7 ; beta > |p0 - p1|
+ movdqa [esp+640-384], xmm4 ; [640-384] = q1 low
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6 ; [656-272] = p2 bytes
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2 ; [640-48] = zero vector
+ movdqa [esp+640-368], xmm6 ; [640-368] = p2 low
+ movdqa [esp+640-144], xmm1 ; [640-144] = p0 low
+ movdqa [esp+640-400], xmm5 ; [640-400] = q0 low
+ pcmpgtw xmm4, xmm7 ; beta > |q0 - q1|
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560] ; alpha > |q0 - p0|
+ pand xmm0, xmm4 ; xmm0 = basic filter mask for the low half
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0 ; xmm7 = constant 2 in all words
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4 ; [640-576] = (alpha >> 2) + 2
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4 ; [640-560] = mask (alpha >> 2) + 2 > |q0 - p0|
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7 ; [640-624] = rounding constant 2
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7 ; beta > |p0 - p2|
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4 ; [640-544] = strong-filter mask for the p side (low half)
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7 ; beta > |q0 - q2|
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4 ; [640-560] = strong-filter mask for the q side (low half)
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4 ; [640-16] = original p2 where the p-side strong mask is clear
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1 ; 2*p3
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6 ; + 3*p2
+ paddw xmm4, [esp+640-480] ; + p1
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1 ; + p0
+ movdqa [esp+640-592], xmm7 ; [640-592] = rounding constant 4
+ paddw xmm4, xmm5 ; + q0
+ paddw xmm4, xmm7 ; + 4 (shifted right by 3 further down to give the strong p2')
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6 ; [640-80] = original q2 where the q-side strong mask is clear
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1 ; 2*q3
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7 ; + 3*q2
+ paddw xmm6, [esp+640-384] ; + q1
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3 ; q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6 ; [640-112] = strong q2' (masked)
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6 ; [640-336] = original p1 where the p-side strong mask is clear
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3 ; p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1
+ psraw xmm6, 2 ; p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7 ; [640-64] = strong p1' (masked)
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7 ; [640-304] = original q1 where the q-side strong mask is clear
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2 ; q1' = (q2 + q1 + q0 + p0 + 2) >> 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5 ; [640-32] = strong q1' (masked)
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2 ; weak p0' = (2*p1 + p0 + q1 + 2) >> 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5 ; [640-352] = weak p0' where the p-side strong mask is clear
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3 ; strong p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1 ; [640-96] = strong p0' (masked)
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2 ; weak q0' = (2*q1 + q0 + p1 + 2) >> 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
+
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3 ; strong q0' = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1 ; [640-128] = strong q0' (masked)
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2 ; from here on: unpack the high 8 pixels and repeat the computation
+ movdqa [esp+640-448], xmm1 ; [640-448] = p1 high
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7 ; [640-288] = weak q0' where the q-side strong mask is clear
+ punpckhbw xmm5, xmm2 ; xmm5 = p2 high
+ movdqa [esp+640-496], xmm1 ; [640-496] = p0 high
+ movdqa [esp+640-432], xmm6 ; [640-432] = q0 high
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7 ; [640-464] = q1 high
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7 ; [640-528] = q2 high
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7 ; [640-560] = |q0 - p0| (high half)
+ por xmm4, [esp+640-16] ; low half: p2 result under the strong mask, original p2 elsewhere
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7 ; beta > |p0 - p1|
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6 ; beta > |q0 - q1|
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7 ; alpha > |q0 - p0|
+ pand xmm1, xmm6 ; xmm1 = basic filter mask for the high half
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7 ; (alpha >> 2) + 2 > |q0 - p0|
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2 ; xmm3 = p3 high
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7 ; beta > |p0 - p2|
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6 ; [640-544] = strong-filter mask for the p side (high half)
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544] ; xmm2 no longer holds zero from here on
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7 ; beta > |q0 - q2|
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6 ; [640-560] = strong-filter mask for the q side (high half)
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4 ; low half p2: filtered where the basic mask is set, original elsewhere
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3 ; high half strong p2'
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2 ; [640-336] = final p1 (low half)
+ movdqa [esp+656-272], xmm6 ; [656-272] = 16 final p2 bytes
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2 ; high half strong p1'
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6 ; [672-272] = 16 final p1 bytes
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3 ; [640-352] = final p0 (low half)
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3 ; high half strong p0'
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2 ; high half weak p0'
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2 ; [688-272] = 16 final p0 bytes
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3 ; high half strong q0'
+ movdqa [esp+640-288], xmm4 ; [640-288] = final q0 (low half)
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2 ; high half weak q0'
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2 ; [704-272] = 16 final q0 bytes
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2 ; [640-304] = final q1 (low half)
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2 ; high half strong q1'
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7 ; xmm6 = 16 final q1 bytes
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0 ; xmm4 = final q2 (low half)
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48] ; q3 high, widened with the saved zero vector
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3 ; high half strong q2'
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0 ; store p2 row (edx = pPix - 3*iStride)
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596] ; edx = q1 row address
+ movdqa [esi], xmm0 ; store p1 row
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0 ; store p0 row
+ movdqa xmm0, [esp+704-272]
+
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0 ; store q0 row
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7 ; xmm4 = 16 final q2 bytes
+ movdqa [edx], xmm6 ; store q1 row
+ movdqa [ecx], xmm4 ; store q2 row (ecx = pPix + 2*iStride)
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;********************************************************************************
+; Gathers 8 bytes from each of 16 consecutive rows of pPixY (row pitch iStride), transposes them with
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; SSE2_TransTwo8x8B and writes eight 16-byte vectors to pDst; pDst is written with movdqa, so it must be 16-byte aligned.
+;********************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeH2V_sse2:
+ push ebp
+ push ebx
+ mov ebp, esp ; two registers are pushed before the frame is set, so the first argument is at [ebp+0Ch]
+ and esp,0FFFFFFF0h
+ sub esp, 10h ; one aligned 16-byte spill slot at [esp]
+
+ mov eax, [ebp + 0Ch] ; eax = pPixY
+ mov ecx, [ebp + 10h] ; ecx = iStride
+ lea edx, [eax + ecx * 8] ; edx = pPixY + 8*iStride (rows 8..15)
+ lea ebx, [ecx*3] ; ebx = 3 * iStride
+
+ movq xmm0, [eax] ; each xmmN: low qword = row N, high qword = row N+8
+ movq xmm7, [edx]
+ punpcklqdq xmm0, xmm7
+ movq xmm1, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm1, xmm7
+ movq xmm2, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm2, xmm7
+ movq xmm3, [eax + ebx]
+ movq xmm7, [edx + ebx]
+ punpcklqdq xmm3, xmm7
+
+ lea eax, [eax + ecx * 4] ; advance both row pointers by 4 rows
+ lea edx, [edx + ecx * 4]
+ movq xmm4, [eax]
+ movq xmm7, [edx]
+ punpcklqdq xmm4, xmm7
+ movq xmm5, [eax + ecx]
+ movq xmm7, [edx + ecx]
+ punpcklqdq xmm5, xmm7
+ movq xmm6, [eax + ecx*2]
+ movq xmm7, [edx + ecx*2]
+ punpcklqdq xmm6, xmm7
+
+ movdqa [esp], xmm0 ; spill xmm0: it is needed as a temporary to build xmm7
+ movq xmm7, [eax + ebx]
+ movq xmm0, [edx + ebx]
+ punpcklqdq xmm7, xmm0
+ movdqa xmm0, [esp]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ mov eax, [ebp + 14h] ; eax = pDst
+ movdqa [eax], xmm4 ; stores follow the pOut register order noted above (m1 = xmm0 ... m8 = xmm7)
+ movdqa [eax + 10h], xmm2
+ movdqa [eax + 20h], xmm3
+ movdqa [eax + 30h], xmm7
+ movdqa [eax + 40h], xmm5
+ movdqa [eax + 50h], xmm1
+ movdqa [eax + 60h], xmm6
+ movdqa [eax + 70h], xmm0
+
+ mov esp, ebp
+ pop ebx
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************************
+; Reads eight 16-byte vectors from pSrc (movdqa, so pSrc must be 16-byte aligned), transposes them with
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+; SSE2_TransTwo8x8B and writes 8 bytes to each of 16 consecutive rows of pPixY (row pitch iStride).
+;*******************************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeV2H_sse2:
+ push ebp
+ mov ebp, esp
+
+ and esp, 0FFFFFFF0h
+ sub esp, 10h ; one aligned 16-byte scratch slot at [esp], passed to the transpose macro
+
+ mov eax, [ebp + 10h] ; eax = pSrc
+ mov ecx, [ebp + 0Ch] ; ecx = iStride
+ mov edx, [ebp + 08h] ; edx = pPixY
+
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 10h]
+ movdqa xmm2, [eax + 20h]
+ movdqa xmm3, [eax + 30h]
+ movdqa xmm4, [eax + 40h]
+ movdqa xmm5, [eax + 50h]
+ movdqa xmm6, [eax + 60h]
+ movdqa xmm7, [eax + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+ lea eax, [ecx * 3] ; eax = 3 * iStride
+
+ movq [edx], xmm4 ; rows 0..3: low qwords, in the pOut register order noted above
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5 ; rows 4..7
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+ psrldq xmm4, 8 ; move the high qwords down for rows 8..15
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm4 ; rows 8..11
+ movq [edx + ecx], xmm2
+ movq [edx + ecx*2], xmm3
+ movq [edx + eax], xmm7
+
+ lea edx, [edx + ecx*4]
+ movq [edx], xmm5 ; rows 12..15
+ movq [edx + ecx], xmm1
+ movq [edx + ecx*2], xmm6
+ movq [edx + eax], xmm0
+
+
+ mov esp, ebp
+ pop ebp
 ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -153,11 +153,11 @@
lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
; ebx [width/16(8)]
; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16) ; top
; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16) ; bottom
-
+
%if %1 == 32 ; for luma
sar ebx, 04h ; width / 16(8) pixels
.top_bottom_loops:
@@ -171,7 +171,7 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
@@ -182,15 +182,15 @@
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
- jnz near .top_bottom_loops
+ jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov edx, ebx
sar ebx, 04h ; (width / 16) pixels
@@ -200,21 +200,21 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+ mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+ mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
jnz near .top_bottom_loops
@@ -241,13 +241,13 @@
%endif
%endmacro
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; ecx [height]
; esi [pSrc+0], edi [pSrc-32], edx [stride], 32(16) ; left
; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16) ; right
; xor eax, eax ; for pixel pData (uint8_t) ; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32 ; for luma
+
+%if %1 == 32 ; for luma
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
@@ -254,37 +254,37 @@
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [edi], xmm0
movdqa [edi+16], xmm0
-
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [ebp], xmm1
movdqa [ebp+16], xmm1
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
-
+ movdqa [edi], xmm0
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdq%2 [ebp], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
jnz near .left_right_loops
%endif
@@ -337,25 +337,25 @@
; TL
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
@@ -373,7 +373,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -385,10 +385,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -396,16 +396,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 32
-
+ exp_top_bottom_sse2 32
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -417,7 +417,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -424,7 +424,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 32, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -434,7 +434,7 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
mov eax, -32 ; luma=-32, chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
@@ -442,19 +442,19 @@
mov ecx, [esp+28] ; stride
imul edx, ecx ; (height+32(16)) * stride
lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
+ lea ebx, [ebp+edx] ; last line of bottom-right border
neg ecx ; -stride
; for left & right border expanding
- exp_cross_sse2 32, a
-
+ exp_cross_sse2 32, a
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -470,7 +470,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -482,10 +482,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -493,16 +493,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -514,7 +514,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -521,7 +521,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -531,9 +531,9 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
mov eax, -16 ; chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; stride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -543,15 +543,15 @@
neg ecx ; -stride
; for left & right border expanding
exp_cross_sse2 16, a
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -567,7 +567,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -579,10 +579,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -590,16 +590,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -611,7 +611,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -618,7 +618,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, u
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -628,9 +628,9 @@
; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
neg ecx ; -stride
mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; stride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -640,14 +640,14 @@
neg ecx ; -stride
; for left & right border expanding
exp_cross_sse2 16, u
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -95,13 +95,13 @@
punpcklbw %1, %3
movdqa %3, %1
punpcklbw %1, %3
-
+
;add %4, %5
movd %2, [%4+%5-1]
movdqa %3, %2
punpcklbw %2, %3
movdqa %3, %2
- punpcklbw %2, %3
+ punpcklbw %2, %3
punpckldq %1, %2
%endmacro
@@ -126,24 +126,24 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpcklwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %4, [%5]
movd %2, [%5+%6]
punpcklbw %4, %2
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
lea %5, [%5+2*%6]
punpcklbw %3, %2
punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
+ punpckhdq %1, %4
+%endmacro
%macro SUMW_HORIZON 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
@@ -173,7 +173,7 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
@@ -197,7 +197,7 @@
ALIGN 16
;***********************************************************************
; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
+;
; pred must align to 16
;***********************************************************************
WelsI4x4LumaPredH_sse2:
@@ -207,11 +207,11 @@
movzx edx, byte [eax-1]
movd xmm0, edx
pmuludq xmm0, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
movd xmm1, edx
pmuludq xmm1, [mmx_01bytes]
-
+
unpcklps xmm0, xmm1
lea eax, [eax+ecx*2]
@@ -218,19 +218,19 @@
movzx edx, byte [eax-1]
movd xmm2, edx
pmuludq xmm2, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
- movd xmm3, edx
+ movd xmm3, edx
pmuludq xmm3, [mmx_01bytes]
-
+
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
-
+
mov edx, [esp+4] ;pred
movdqa [edx], xmm0
-
+
ret
-
+
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -241,9 +241,9 @@
mov ecx, [esp + pushsize + 12]
sub esi, 1
sub esi, ecx
-
+
;for H
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
movq xmm0, [esi]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
@@ -253,7 +253,7 @@
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
-
+
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
movsx eax, ax
@@ -261,26 +261,26 @@
add eax, 32
sar eax, 6 ; b = (5 * H + 32) >> 6;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx edx, BYTE [esi+16]
+
+ movzx edx, BYTE [esi+16]
sub esi, 3
LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
-
+
add esi, 3
movzx eax, BYTE [esi+8*ecx]
add edx, eax
shl edx, 4 ; a = (left[15*stride] + top[15]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
- pxor xmm4, xmm4
+ pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
punpckhbw xmm7, xmm4
pmullw xmm7, xmm6
psubw xmm7, xmm0
-
+
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
@@ -288,17 +288,17 @@
imul eax, 5
add eax, 32
sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_inc_minus]
-
+
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -307,7 +307,7 @@
movdqa xmm3, xmm1
pmullw xmm3, xmm6
paddw xmm3, xmm0
- psraw xmm3, 5
+ psraw xmm3, 5
packuswb xmm2, xmm3
movdqa [esi], xmm2
paddw xmm0, xmm4
@@ -314,13 +314,13 @@
add esi, 16
inc eax
cmp eax, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
-
+ jnz get_i16x16_luma_pred_plane_sse2_1
+
pop esi
ret
-
-
-
+
+
+
;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -327,7 +327,7 @@
%macro SSE2_PRED_H_16X16_TWO_LINE 1
lea eax, [eax+ecx*2]
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx+%1], xmm0
COPY_16_TIMESS eax, xmm0, ecx
@@ -340,13 +340,13 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
+
+ SSE2_PRED_H_16X16_TWO_LINE 0x20
SSE2_PRED_H_16X16_TWO_LINE 0x40
SSE2_PRED_H_16X16_TWO_LINE 0x60
SSE2_PRED_H_16X16_TWO_LINE 0x80
@@ -353,9 +353,9 @@
SSE2_PRED_H_16X16_TWO_LINE 0xa0
SSE2_PRED_H_16X16_TWO_LINE 0xc0
SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
+
ret
-
+
;***********************************************************************
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -364,10 +364,10 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
sub eax, ecx
movdqa xmm0, [eax]
-
+
movdqa [edx], xmm0
movdqa [edx+10h], xmm0
movdqa [edx+20h], xmm0
@@ -378,15 +378,15 @@
movdqa [edx+70h], xmm0
movdqa [edx+80h], xmm0
movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
+ movdqa [edx+160], xmm0
movdqa [edx+176], xmm0
movdqa [edx+192], xmm0
movdqa [edx+208], xmm0
movdqa [edx+224], xmm0
movdqa [edx+240], xmm0
-
+
ret
-
+
;***********************************************************************
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -398,8 +398,8 @@
mov ecx, [esp + pushsize + 12] ;stride
sub esi, 1
sub esi, ecx
-
- pxor mm7, mm7
+
+ pxor mm7, mm7
movq mm0, [esi]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
@@ -409,7 +409,7 @@
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
-
+
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
@@ -419,7 +419,7 @@
add eax, 16
sar eax, 5 ; b = (17 * H + 16) >> 5;
SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
-
+
movzx edx, BYTE [esi+8]
sub esi, 3
LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
@@ -428,17 +428,17 @@
movzx eax, BYTE [esi+4*ecx]
add edx, eax
shl edx, 4 ; a = (left[7*stride] + top[7]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
- pxor mm4, mm4
+ pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
punpckhbw mm7, mm4
pmullw mm7, mm6
psubw mm7, mm0
-
+
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
@@ -448,17 +448,17 @@
imul eax, 17
add eax, 16
sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_mul_b_c]
-
+
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -470,12 +470,12 @@
add esi, 8
inc eax
cmp eax, 8
- jnz get_i_chroma_pred_plane_sse2_1
-
+ jnz get_i_chroma_pred_plane_sse2_1
+
pop esi
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
@@ -487,13 +487,13 @@
; pred[7] = ([6]+[0]*2+[1]+2)/4
;
; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
+;
;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WelsI4x4LumaPredDDR_mmx:
mov edx,[esp+4] ;pred
mov eax,[esp+8] ;pRef
mov ecx,[esp+12] ;stride
-
+
movq mm1,[eax+ecx-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
sub eax, ecx ;mov eax to above line of current block(postion of 1)
@@ -520,17 +520,17 @@
pand mm1,[mmx_01bytes] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
- movd [edx+12],mm2
- psrlq mm2,8
- movd [edx+8],mm2
- psrlq mm2,8
- movd [edx+4],mm2
- psrlq mm2,8
+
+ movd [edx+12],mm2
+ psrlq mm2,8
+ movd [edx+8],mm2
+ psrlq mm2,8
+ movd [edx+4],mm2
+ psrlq mm2,8
movd [edx],mm2
WELSEMMS
ret
-
+
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
@@ -542,44 +542,44 @@
; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
+;
;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
+WelsI4x4LumaPredDc_sse2:
mov eax,[esp+8] ;pRef
mov ecx,[esp+12] ;stride
push ebx
-
+
movzx edx, byte [eax-1h]
-
+
sub eax, ecx
movd xmm0, [eax]
pxor xmm1, xmm1
psadbw xmm0, xmm1
-
+
movd ebx, xmm0
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2-1h]
add ebx, edx
-
+
lea eax, [eax+ecx*2-1]
movzx edx, byte [eax+ecx]
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2]
add ebx, edx
add ebx, 4
sar ebx, 3
imul ebx, 0x01010101
-
+
mov edx, [esp+8] ;pred
movd xmm0, ebx
pshufd xmm0, xmm0, 0
movdqa [edx], xmm0
-
+
pop ebx
- ret
-
+ ret
+
ALIGN 16
;***********************************************************************
; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -588,7 +588,7 @@
%macro MMX_PRED_H_8X8_ONE_LINE 4
movq %1, [%3-8]
psrlq %1, 38h
-
+
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
@@ -598,7 +598,7 @@
%macro MMX_PRED_H_8X8_ONE_LINEE 4
movq %1, [%3+ecx-8]
psrlq %1, 38h
-
+
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
@@ -610,34 +610,34 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
movq mm0, [eax-8]
psrlq mm0, 38h
-
+
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
movq [edx], mm0
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+8
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+16
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+24
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+32
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+40
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+48
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+56
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+56
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;***********************************************************************
; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -648,12 +648,12 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
sub eax, ecx
movd xmm0, [eax]
pshufd xmm0, xmm0, 0
movdqa [edx], xmm0
- ret
+ ret
ALIGN 16
;***********************************************************************
@@ -665,7 +665,7 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
sub eax, ecx
movq xmm0, [eax]
movdqa xmm1, xmm0
@@ -676,8 +676,8 @@
movdqa [edx+32], xmm0
movdqa [edx+48], xmm0
ret
-
-
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -703,13 +703,13 @@
; f = (2 + l1 + (l0<<1) + lt)>>2
; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
; [b a f e h g j i] + [d c b a] --> mov to memory
-;
+;
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
+WelsI4x4LumaPredHD_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -716,16 +716,16 @@
sub eax, ecx
movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
+ movd mm2, [eax+2*ecx-4]
punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
+
movq mm1, mm0
psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
movq mm2, mm0
@@ -733,17 +733,17 @@
movq mm3, mm2
movq mm4, mm1
pavgb mm1, mm0
-
+
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
+
movq mm4, mm0
pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
+
psrlq mm2, 20h
psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
movq mm4, mm3
@@ -750,7 +750,7 @@
psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
+
movd [edx], mm2
movd [edx+12], mm3
psrlq mm3, 10h
@@ -759,9 +759,9 @@
movd [edx+4], mm3
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -784,17 +784,17 @@
; b = (2 + l0 + (l1<<1) + l2)>>2
; d = (2 + l1 + (l2<<1) + l3)>>2
; f = (2 + l2 + (l3<<1) + l3)>>2
-
+
; [g g f e d c b a] + [g g g g] --> mov to memory
-;
+;
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
+WelsI4x4LumaPredHU_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
movd mm0, [eax-4] ; mm0[3] = l0
punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
lea eax, [eax+2*ecx]
@@ -802,38 +802,38 @@
movd mm4, [eax+ecx-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
+
psrlq mm4, 18h
psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
psrlq mm0, 8h
pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
movq mm5, mm2
pavgb mm2, mm0
-
+
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
pand mm5, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
+
psrlq mm2, 8h
pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
+
punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
+
psrlq mm4, 20h
movd [edx+12], mm4
-
+
movd [edx], mm1
psrlq mm1, 10h
movd [edx+4], mm1
@@ -841,9 +841,9 @@
movd [edx+8], mm1
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -869,12 +869,12 @@
; h = (2 + t1 + (t2<<1) + t3)>>2
; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
+WelsI4x4LumaPredVR_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -881,57 +881,57 @@
sub eax, ecx
movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movq mm2, [eax+ecx-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
movq mm3, mm2
pavgb mm2, mm0
-
+
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
-
+
movq mm3, mm0
psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
movq mm2, mm3
-
+
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
movd [edx], mm1
-
+
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
movd [edx+4], mm2
-
+
movq mm4, mm3
psllq mm4, 20h
psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
+
movq mm5, mm3
psllq mm5, 28h
psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
+
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
movd [edx+8], mm4
-
+
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
movd [edx+12], mm5
WELSEMMS
ret
-
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -954,13 +954,13 @@
; e = (2 + t4 + t6 + (t5<<1))>>2
; f = (2 + t5 + t7 + (t6<<1))>>2
; g = (2 + t6 + t7 + (t7<<1))>>2
-
+
; [g f e d c b a] --> mov to memory
-;
+;
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
+WelsI4x4LumaPredDDL_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -968,11 +968,11 @@
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
movq mm3, mm0
psrlq mm3, 38h
psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
+
psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
psrlq mm2, 8h
pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -982,9 +982,9 @@
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
-
+
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
+
psrlq mm0, 8h
movd [edx], mm0
psrlq mm0, 8h
@@ -995,8 +995,8 @@
movd [edx+12], mm0
WELSEMMS
ret
-
-
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1022,46 +1022,46 @@
; g = (2 + t2 + (t3<<1) + t4)>>2
; h = (2 + t3 + (t4<<1) + t5)>>2
; j = (2 + t4 + (t5<<1) + t6)>>2
-
+
; [i d c b a] + [j h g f e] --> mov to memory
-;
+;
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
+WelsI4x4LumaPredVL_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
sub eax, ecx
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
movq mm3, mm1
pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
+
movq mm4, mm2
- pavgb mm2, mm0
+ pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
+
movd [edx], mm3
psrlq mm3, 8h
movd [edx+8], mm3
-
+
movd [edx+4], mm2
psrlq mm2, 8h
movd [edx+12], mm2
WELSEMMS
ret
-
+
ALIGN 16
;***********************************************************************
;
@@ -1068,14 +1068,14 @@
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
+WelsIChromaPredDc_sse2:
push ebx
mov eax, [esp+12] ; pRef
mov ecx, [esp+16] ; stride
-
+
sub eax, ecx
movq mm0, [eax]
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01] ; l1
movzx ebx, byte [eax+ecx-0x01] ; l1
@@ -1089,7 +1089,7 @@
movzx edx, byte [eax-0x01] ; l4
add ebx, edx
movd mm1, ebx ; mm1 = l1+l2+l3+l4
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01] ; l5
movzx ebx, byte [eax+ecx-0x01] ; l5
@@ -1103,7 +1103,7 @@
movzx edx, byte [eax-0x01] ; l8
add ebx, edx
movd mm2, ebx ; mm2 = l5+l6+l7+l8
-
+
movq mm3, mm0
psrlq mm0, 0x20
psllq mm3, 0x20
@@ -1110,56 +1110,56 @@
psrlq mm3, 0x20
pxor mm4, mm4
psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
paddq mm3, mm1
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
+
movq mm4, [mmx_0x02]
-
+
paddq mm0, mm4
psrlq mm0, 0x02
-
+
paddq mm2, mm4
psrlq mm2, 0x02
-
+
paddq mm3, mm4
paddq mm3, mm4
psrlq mm3, 0x03
-
+
paddq mm1, mm4
paddq mm1, mm4
psrlq mm1, 0x03
-
+
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-
+
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
-
+
mov edx, [esp+8] ; pRef
-
+
movq [edx], mm0
movq [edx+0x08], mm0
movq [edx+0x10], mm0
movq [edx+0x18], mm0
-
+
movq [edx+0x20], mm1
movq [edx+0x28], mm1
movq [edx+0x30], mm1
movq [edx+0x38], mm1
-
+
pop ebx
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;***********************************************************************
;
@@ -1166,11 +1166,11 @@
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
+WelsI16x16LumaPredDc_sse2:
push ebx
mov eax, [esp+12] ; pRef
mov ecx, [esp+16] ; stride
-
+
sub eax, ecx
movdqa xmm0, [eax] ; read one row
pxor xmm1, xmm1
@@ -1180,7 +1180,7 @@
pslldq xmm0, 0x08
psrldq xmm0, 0x08
paddw xmm0, xmm1
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01]
movzx ebx, byte [eax+ecx-0x01]
@@ -1201,7 +1201,7 @@
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
-
+
mov edx, [esp+8] ; pred
movdqa [edx], xmm0
movdqa [edx+0x10], xmm0
@@ -1219,7 +1219,7 @@
movdqa [edx+0xd0], xmm0
movdqa [edx+0xe0], xmm0
movdqa [edx+0xf0], xmm0
-
+
pop ebx
ret
@@ -1226,7 +1226,7 @@
;***********************************************************************
;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;***********************************************************************
@@ -1238,7 +1238,7 @@
push edi
mov eax, [esp+24];p_enc
mov ebx, [esp+28];linesize_enc
-
+
; load source 4x4 samples and Hadamard transform
movd xmm0, [eax]
movd xmm1, [eax+ebx]
@@ -1247,16 +1247,16 @@
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
-
+
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklbw xmm1, xmm6
-
+
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
+
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
@@ -1264,7 +1264,7 @@
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
-
+
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
@@ -1271,14 +1271,14 @@
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
-
+
SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
+
; Hadamard transform results are saved in xmm0 and xmm2
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
-
+
; load top boundary samples: [a b c d]
mov eax, [esp+16];p_dec
sub eax, [esp+20];linesize_dec
@@ -1286,7 +1286,7 @@
movzx edx, byte [eax+1]
movzx esi, byte [eax+2]
movzx edi, byte [eax+3]
-
+
; get the transform results of top boundary samples: [a b c d]
add edx, ecx ; edx = a + b
add edi, esi ; edi = c + d
@@ -1300,7 +1300,7 @@
add esi, ecx ; esi = (a - b) + (c - d)
add ecx, ecx
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
+
movdqa xmm6, xmm0
movdqa xmm7, xmm2
movd xmm5, edi ; store the edi for DC mode
@@ -1312,16 +1312,16 @@
pinsrw xmm4, edx, 0
pinsrw xmm4, ecx, 4
psllw xmm4, 2
-
+
; get the satd of H
psubw xmm0, xmm3
psubw xmm2, xmm4
-
+
WELS_AbsW xmm0, xmm1
WELS_AbsW xmm2, xmm1
paddusw xmm0, xmm2
SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
+
; load left boundary samples: [a b c d]'
mov eax, [esp+16]
mov ebx, [esp+20]
@@ -1330,7 +1330,7 @@
lea eax , [eax+2*ebx]
movzx esi, byte [eax-1]
movzx edi, byte [eax+ebx-1]
-
+
; get the transform results of left boundary samples: [a b c d]'
add edx, ecx ; edx = a + b
add edi, esi ; edi = c + d
@@ -1344,14 +1344,14 @@
add esi, ecx ; esi = (a - b) + (c - d)
add ecx, ecx
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
+
+ ; store the transform results in xmm3
movd xmm3, edi
pinsrw xmm3, edx, 1
pinsrw xmm3, ecx, 2
pinsrw xmm3, esi, 3
psllw xmm3, 2
-
+
; get the satd of V
movdqa xmm2, xmm6
movdqa xmm4, xmm7
@@ -1368,7 +1368,7 @@
psrlw xmm1, 3
movdqa xmm5, xmm1
psllw xmm1, 4
-
+
; get the satd of DC
psubw xmm6, xmm1
WELS_AbsW xmm6, xmm1
@@ -1375,7 +1375,7 @@
WELS_AbsW xmm7, xmm1
paddusw xmm6, xmm7
SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
+
; comparing order: DC H V
mov edx, [esp+32]
movd eax, xmm6
@@ -1394,9 +1394,9 @@
jg near not_dc
cmp ax, si
jg near not_dc_h
-
+
; for DC mode
- movd ebx, xmm5
+ movd ebx, xmm5
imul ebx, 0x01010101
movd xmm5, ebx
pshufd xmm5, xmm5, 0
@@ -1407,11 +1407,11 @@
pop esi
pop ebx
ret
-
+
not_dc:
cmp di, si
jg near not_dc_h
-
+
; for H mode
SSE_DB_1_2REG xmm6, xmm7
mov eax, [esp+16]
@@ -1422,20 +1422,20 @@
movzx ecx, byte [eax+ebx-1]
movd xmm1, ecx
- pmuludq xmm1, xmm6
+ pmuludq xmm1, xmm6
%if 1
punpckldq xmm0, xmm1
-%else
+%else
unpcklps xmm0, xmm1
%endif
lea eax, [eax+ebx*2]
movzx ecx, byte [eax-1]
movd xmm2, ecx
- pmuludq xmm2, xmm6
+ pmuludq xmm2, xmm6
movzx ecx, byte [eax+ebx-1]
- movd xmm3, ecx
- pmuludq xmm3, xmm6
+ movd xmm3, ecx
+ pmuludq xmm3, xmm6
%if 1
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
@@ -1442,13 +1442,13 @@
%else
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
-%endif
+%endif
movdqa [edx],xmm0
-
+
mov eax, edi
mov ebx, [esp+36]
mov dword [ebx], 0x01
-
+
pop edi
pop esi
pop ebx
@@ -1460,14 +1460,14 @@
movd xmm0, [eax]
pshufd xmm0, xmm0, 0
movdqa [edx],xmm0
-
+
mov eax, esi
mov ebx, [esp+36]
mov dword [ebx], 0x00
-
+
pop edi
pop esi
pop ebx
ret
-
+
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -32,7 +32,7 @@
;* intra_pred_util.asm
;*
;* Abstract
-;* mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
+;* mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
;* WelsFillingPred1to16 etc.
;*
;* History
@@ -84,7 +84,7 @@
movq mm0, [ecx]
movq [eax ], mm0
movq [eax+8], mm0
-
+
WELSEMMS
ret
@@ -100,16 +100,16 @@
movq mm1, [ecx+8]
movq [eax ], mm0
movq [eax+8], mm1
-
+
WELSEMMS
ret
%macro butterfly_1to8_mmx 3 ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %2, e%3x ; i.e, 1% = eax (=b0)
- pshufw %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ mov %3h, %3l
+ movd %2, e%3x ; i.e, 1% = eax (=b0)
+ pshufw %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
ALIGN 16
;***********************************************************************----------------
@@ -120,10 +120,10 @@
mov cl, byte [esp+8] ; v
butterfly_1to8_mmx mm0, mm1, c ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-
+
movq [eax ], mm0
movq [eax+8], mm0
-
+
WELSEMMS
ret
@@ -136,9 +136,9 @@
mov eax, [esp+4] ; pred
mov ecx, [esp+8] ; v
- movdqa xmm0, [ecx]
- movdqa [eax], xmm0
-
+ movdqa xmm0, [ecx]
+ movdqa [eax], xmm0
+
ret
ALIGN 16
@@ -150,7 +150,7 @@
mov cl, byte [esp+8] ; v
butterfly_1to16_sse xmm0, xmm1, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
-
+
movdqa [eax], xmm0
-
+
ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -32,7 +32,7 @@
;* mb_copy.asm
;*
;* Abstract
-;* mb_copy
+;* mb_copy
;*
;*
;*********************************************************************************************/
@@ -52,9 +52,9 @@
WELS_EXTERN WelsCopy16x16_sse2
WELS_EXTERN WelsCopy16x16NotAligned_sse2
WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
-WELS_EXTERN WelsCopy8x16_mmx ;
-WELS_EXTERN UpdateMbMv_sse2 ;
+WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
+WELS_EXTERN WelsCopy8x16_mmx ;
+WELS_EXTERN UpdateMbMv_sse2 ;
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
@@ -66,7 +66,7 @@
WelsCopy16x16_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -107,7 +107,7 @@
movdqa xmm5, [esi+ecx]
movdqa xmm6, [esi+2*ecx]
movdqa xmm7, [esi+edx]
-
+
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
movdqa [edi+2*eax], xmm2
@@ -116,7 +116,7 @@
movdqa [edi], xmm4
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
+ movdqa [edi+ebx], xmm7
pop ebx
pop edi
@@ -134,7 +134,7 @@
WelsCopy16x16NotAligned_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -175,7 +175,7 @@
movdqu xmm5, [esi+ecx]
movdqu xmm6, [esi+2*ecx]
movdqu xmm7, [esi+edx]
-
+
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
movdqa [edi+2*eax], xmm2
@@ -184,8 +184,8 @@
movdqa [edi], xmm4
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
-
+ movdqa [edi+ebx], xmm7
+
pop ebx
pop edi
pop esi
@@ -202,7 +202,7 @@
WelsCopy16x8NotAligned_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -220,7 +220,7 @@
movdqu xmm4, [esi]
movdqu xmm5, [esi+ecx]
movdqu xmm6, [esi+2*ecx]
- movdqu xmm7, [esi+edx]
+ movdqu xmm7, [esi+edx]
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
@@ -231,7 +231,7 @@
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
movdqa [edi+ebx], xmm7
-
+
pop ebx
pop edi
pop esi
@@ -245,7 +245,7 @@
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
-WelsCopy8x16_mmx:
+WelsCopy8x16_mmx:
push ebx
mov eax, [esp + 8 ] ;Dst
@@ -253,60 +253,60 @@
mov ebx, [esp + 16] ;Src
mov edx, [esp + 20] ;iStrideS
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
+ movq mm0, [ebx]
+ movq mm1, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
+ movq mm2, [ebx]
+ movq mm3, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
+ movq mm4, [ebx]
+ movq mm5, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
+ movq mm6, [ebx]
+ movq mm7, [ebx+edx]
lea ebx, [ebx+2*edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
+
+ movq [eax], mm0
+ movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
- movq [eax], mm2
+ movq [eax], mm2
movq [eax+ecx], mm3
lea eax, [eax+2*ecx]
- movq [eax], mm4
+ movq [eax], mm4
movq [eax+ecx], mm5
lea eax, [eax+2*ecx]
- movq [eax], mm6
+ movq [eax], mm6
movq [eax+ecx], mm7
lea eax, [eax+2*ecx]
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
+ movq mm0, [ebx]
+ movq mm1, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
+ movq mm2, [ebx]
+ movq mm3, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
+ movq mm4, [ebx]
+ movq mm5, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
+ movq mm6, [ebx]
+ movq mm7, [ebx+edx]
+
+ movq [eax], mm0
+ movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
- movq [eax], mm2
+ movq [eax], mm2
movq [eax+ecx], mm3
lea eax, [eax+2*ecx]
- movq [eax], mm4
+ movq [eax], mm4
movq [eax+ecx], mm5
lea eax, [eax+2*ecx]
- movq [eax], mm6
- movq [eax+ecx], mm7
+ movq [eax], mm6
+ movq [eax+ecx], mm7
WELSEMMS
- pop ebx
+ pop ebx
ret
-
+
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
; int32_t iStrideD,
@@ -314,7 +314,7 @@
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
-WelsCopy8x8_mmx:
+WelsCopy8x8_mmx:
push ebx
push esi
mov eax, [esp + 12] ;Dst
@@ -343,7 +343,7 @@
lea esi, [esi+2*ebx]
movq mm6, [esi]
movq mm7, [esi+ebx]
-
+
movq [eax], mm0
movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
@@ -355,12 +355,12 @@
lea eax, [eax+2*ecx]
movq [eax], mm6
movq [eax+ecx], mm7
-
+
WELSEMMS
- pop esi
+ pop esi
pop ebx
ret
-
+
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@@ -417,8 +417,8 @@
WELS_EXTERN McCopyWidthEq4_mmx
WELS_EXTERN McCopyWidthEq8_mmx
WELS_EXTERN McCopyWidthEq16_sse2
-
+
ALIGN 16
;***********************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *dst, int32_t iDstStride,
@@ -432,19 +432,19 @@
push esi
push edi
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
+ mov edi, [esp+20]
+ mov esi, [esp+28]
+ mov edx, [esp+36]
+ mov ebp, [esp+24]
+ mov eax, [esp+32]
+ mov ebx, [esp+40]
+ mov ecx, [esp+44]
sar ecx, 2
.height_loop:
- movq mm0, [esi]
+ movq mm0, [esi]
pavgb mm0, [edx]
movq [edi], mm0
- movq mm1, [esi+eax]
+ movq mm1, [esi+eax]
pavgb mm1, [edx+ebx]
movq [edi+ebp], mm1
lea edi, [edi+2*ebp]
@@ -451,19 +451,19 @@
lea esi, [esi+2*eax]
lea edx, [edx+2*ebx]
- movq mm2, [esi]
+ movq mm2, [esi]
pavgb mm2, [edx]
movq [edi], mm2
- movq mm3, [esi+eax]
+ movq mm3, [esi+eax]
pavgb mm3, [edx+ebx]
movq [edi+ebp], mm3
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
lea edx, [edx+2*ebx]
-
+
dec ecx
jne .height_loop
-
+
WELSEMMS
pop edi
pop esi
@@ -485,19 +485,19 @@
push esi
push edi
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
+ mov edi, [esp+20]
+ mov esi, [esp+28]
+ mov edx, [esp+36]
+ mov ebp, [esp+24]
+ mov eax, [esp+32]
+ mov ebx, [esp+40]
+ mov ecx, [esp+44]
sar ecx, 2
.height_loop:
movdqu xmm0, [esi]
movdqu xmm1, [edx]
movdqu xmm2, [esi+eax]
- movdqu xmm3, [edx+ebx]
+ movdqu xmm3, [edx+ebx]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
movdqu [edi], xmm0
@@ -504,12 +504,12 @@
movdqu [edi+ebp], xmm2
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
+ lea edx, [edx+2*ebx]
movdqu xmm4, [esi]
movdqu xmm5, [edx]
movdqu xmm6, [esi+eax]
- movdqu xmm7, [edx+ebx]
+ movdqu xmm7, [edx+ebx]
pavgb xmm4, xmm5
pavgb xmm6, xmm7
movdqu [edi], xmm4
@@ -516,11 +516,11 @@
movdqu [edi+ebp], xmm6
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
+ lea edx, [edx+2*ebx]
+
dec ecx
jne .height_loop
-
+
pop edi
pop esi
pop ebx
@@ -540,7 +540,7 @@
dec dword [esp+4]
jg avg_w16_align_0_ssse3
ret
-
+
ALIGN 64
avg_w16_align_1_ssse3:
movdqa xmm1, [ebx+16]
@@ -555,7 +555,7 @@
jg avg_w16_align_1_ssse3
ret
-
+
ALIGN 16
;***********************************************************************
; void PixelAvgWidthEq16_ssse3(uint8_t *pDst, int32_t iDstStride,
@@ -574,7 +574,7 @@
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
-
+
%define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
mov edx, ebx
and edx, 0x01
@@ -582,11 +582,11 @@
lea ebp, [avg_w16_offset]
imul ebp, edx
lea edx, [ebp+eax]
-
- mov eax, [esp+32]
- mov ebp, [esp+44]
+
+ mov eax, [esp+32]
+ mov ebp, [esp+44]
push ebp
- mov ebp, [esp+44]
+ mov ebp, [esp+44]
and ebx, 0xfffffff0
call edx
pop ebp
@@ -607,7 +607,7 @@
push edi
push ebx
-
+
mov esi, [esp+16]
mov eax, [esp+20]
mov edi, [esp+24]
@@ -617,12 +617,12 @@
.height_loop:
mov ebx, [esi]
mov [edi], ebx
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
- WELSEMMS
+ WELSEMMS
pop ebx
pop edi
pop esi
@@ -650,12 +650,12 @@
add edi, ecx
dec edx
jnz .height_loop
-
- WELSEMMS
+
+ WELSEMMS
pop edi
pop esi
ret
-
+
ALIGN 16
;***********************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@@ -664,11 +664,11 @@
push esi
push edi
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- mov ecx, [esp+28]
+ mov esi, [esp+12]
+ mov eax, [esp+16]
+ mov edi, [esp+20]
+ mov edx, [esp+24]
+ mov ecx, [esp+28]
ALIGN 4
.height_loop:
@@ -681,7 +681,7 @@
lea esi, [esi+eax*2]
lea edi, [edi+edx*2]
jnz .height_loop
-
+
pop edi
pop esi
ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd mm3, [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movd mm0, [esi]
+ movd mm1, [esi+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [ebx]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [ebx+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [edi], mm0
+
+ movq mm0, mm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+ WELSEMMS
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ push esi
+ push edi
+ push ebx
+
+ mov eax, [esp +12 + 20]
+ movd xmm3, [eax]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ lea ebx, [esi + eax]
+ movq xmm0, [esi]
+ movq xmm1, [esi+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [ebx+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [edi], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea edi, [edi +edx ]
+ lea ebx, [ebx + eax]
+
+ dec ecx
+ jnz near .xloop
+
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+; int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ push ebx
+ push esi
+ push edi
+
+ mov eax, [esp + 12 + 20]
+
+ pxor xmm7, xmm7
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
+ sub esi, edi
+ sub esi, edi
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [eax]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea esi, [esi+2*edi]
+
+ movdqu xmm2, [eax+edx]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [esi],xmm0
+
+ lea eax, [eax+2*edx]
+ movdqu xmm2, [eax]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [esi+edi],xmm4
+
+ sub ecx, 2
+ jnz .hloop_chroma
+ pop edi
+ pop esi
+ pop ebx
+
+ ret
+
+
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -91,10 +91,10 @@
ALIGN 16
;***********************************************************************
-; void McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iHeight,
; );
;***********************************************************************
@@ -101,19 +101,19 @@
McHorVer20WidthEq16_sse2:
push esi
push edi
-
- mov esi, [esp + 12]
- mov eax, [esp + 16]
- mov edi, [esp + 20]
- mov ecx, [esp + 28]
- mov edx, [esp + 24]
- sub esi, 2
-
+
+ mov esi, [esp + 12]
+ mov eax, [esp + 16]
+ mov edi, [esp + 20]
+ mov ecx, [esp + 28]
+ mov edx, [esp + 24]
+ sub esi, 2
+
WELS_Zero xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
-
+
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -126,7 +126,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -152,7 +152,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -165,8 +165,8 @@
psraw xmm0, 5
packuswb xmm0, xmm7
movq [edi+8], xmm0
-
-
+
+
add esi, eax
add edi, edx
dec ecx
@@ -178,9 +178,9 @@
ALIGN 16
;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
-; int32_t iSrcStride,
-; uint8_t* pTap,
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
+; int32_t iSrcStride,
+; uint8_t* pTap,
; int32_t iTapStride,
; int32_t iHeight);
;***********************************************************************
@@ -193,11 +193,11 @@
mov edi, [esp+24] ;tap
mov edx, [esp+28] ;tap_stride
mov ebx, [esp+32] ;i_height
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
sub esi, eax ;;;;;;;;need more 5 lines.
sub esi, eax
-
+
.yloop_width_8:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
@@ -211,7 +211,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -221,7 +221,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
add esi, eax
add edi, edx
dec ebx
@@ -230,12 +230,12 @@
pop edi
pop esi
ret
-
+
;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
@@ -242,18 +242,18 @@
McHorVer02WidthEq8_sse2:
push esi
push edi
-
- mov esi, [esp + 12]
- mov edx, [esp + 16]
- mov edi, [esp + 20]
- mov eax, [esp + 24]
- mov ecx, [esp + 28]
+ mov esi, [esp + 12]
+ mov edx, [esp + 16]
+ mov edi, [esp + 20]
+ mov eax, [esp + 24]
+ mov ecx, [esp + 28]
+
sub esi, edx
sub esi, edx
WELS_Zero xmm7
-
+
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -262,8 +262,8 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .xx_exit
@@ -273,7 +273,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .xx_exit
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -356,11 +356,11 @@
;***********************************************************************
-; void McHorVer02_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
+; void McHorVer02_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
; int32_t iDstStride,
-; int32_t iWidth,
+; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
@@ -368,19 +368,19 @@
push esi
push edi
push ebx
-
- mov esi, [esp + 16]
- mov edx, [esp + 20]
- mov edi, [esp + 24]
- mov eax, [esp + 28]
- mov ecx, [esp + 36]
- mov ebx, [esp + 32]
+
+ mov esi, [esp + 16]
+ mov edx, [esp + 20]
+ mov edi, [esp + 24]
+ mov eax, [esp + 28]
+ mov ecx, [esp + 36]
+ mov ebx, [esp + 32]
shr ebx, 3
sub esi, edx
sub esi, edx
-
-.xloop:
- WELS_Zero xmm7
+
+.xloop:
+ WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -389,7 +389,7 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
+
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*edx]
@@ -402,8 +402,8 @@
movdqa xmm5,xmm6
add edi, eax
sub esi, edx
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
@@ -413,7 +413,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -454,16 +454,16 @@
SSE_LOAD_8P xmm5, xmm6, [esi+edx]
jmp near .start
-.x_loop_dec:
+.x_loop_dec:
dec ebx
jz near .xx_exit
- mov esi, [esp + 16]
- mov edi, [esp + 24]
+ mov esi, [esp + 16]
+ mov edi, [esp + 24]
sub esi, edx
sub esi, edx
add esi, 8
add edi, 8
- mov ecx, [esp + 36]
+ mov ecx, [esp + 36]
jmp near .xloop
.xx_exit:
@@ -473,12 +473,12 @@
ret
-ALIGN 16
+ALIGN 16
;***********************************************************************
-; void McHorVer20_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer20_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
@@ -487,19 +487,19 @@
push esi
push edi
push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
+ mov esi, [esp+16]
+ mov eax, [esp+20]
+ mov edi, [esp+24]
+ mov edx, [esp+28]
+ mov ecx, [esp+32]
+ mov ebx, [esp+36]
sub esi, 2
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
+ jne near .width_17
+
+.yloop_width_9:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -512,7 +512,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -526,12 +526,12 @@
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
- movd [edi], xmm0
-
+ movd [edi], xmm0
+
pxor xmm7, xmm7
movq xmm0, [esi+6]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -543,8 +543,8 @@
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [edi+1], xmm2
-
+ movq [edi+1], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -553,8 +553,8 @@
pop edi
pop esi
ret
-
-
+
+
.width_17:
.yloop_width_17:
movq xmm0, [esi]
@@ -569,7 +569,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -582,7 +582,7 @@
psraw xmm0, 5
packuswb xmm0, xmm0
movq [edi], xmm0
-
+
movq xmm0, [esi+8]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5+8]
@@ -595,7 +595,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -610,12 +610,12 @@
psraw xmm0, 5
packuswb xmm0, xmm0
movd [edi+8], xmm0
-
-
+
+
pxor xmm7, xmm7
movq xmm0, [esi+6+8]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -627,7 +627,7 @@
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [edi+9], xmm2
+ movq [edi+9], xmm2
add esi, eax
add edi, edx
dec ebx
@@ -636,14 +636,14 @@
pop edi
pop esi
ret
-
-
+
+
ALIGN 16
;***********************************************************************
;void McHorVer22HorFirst_sse2
-; (uint8_t *pSrc,
-; int32_t iSrcStride,
+; (uint8_t *pSrc,
+; int32_t iSrcStride,
; uint8_t * pTap,
; int32_t iTapStride,
; int32_t iWidth,int32_t iHeight);
@@ -652,21 +652,21 @@
push esi
push edi
push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
- pxor xmm7, xmm7
-
+ mov esi, [esp+16]
+ mov eax, [esp+20]
+ mov edi, [esp+24]
+ mov edx, [esp+28]
+ mov ecx, [esp+32]
+ mov ebx, [esp+36]
+ pxor xmm7, xmm7
+
sub esi, eax ;;;;;;;;need more 5 lines.
sub esi, eax
-
+
cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
+ jne near .width_17
+
+.yloop_width_9:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -679,7 +679,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -690,12 +690,12 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
- movd [edi], xmm0
-
+ movd [edi], xmm0
+
pxor xmm7, xmm7
movq xmm0, [esi+6]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -704,9 +704,9 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
- movq [edi+2], xmm2
- movhps [edi+2+8], xmm2
-
+ movq [edi+2], xmm2
+ movhps [edi+2+8], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -715,8 +715,8 @@
pop edi
pop esi
ret
-
-
+
+
.width_17:
.yloop_width_17:
movq xmm0, [esi]
@@ -731,7 +731,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -741,7 +741,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
movq xmm0, [esi+8]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5+8]
@@ -754,7 +754,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -766,12 +766,12 @@
psllw xmm6, 2
paddw xmm0, xmm6
movd [edi+16], xmm0
-
-
+
+
pxor xmm7, xmm7
movq xmm0, [esi+6+8]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -780,9 +780,9 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
- movq [edi+18], xmm2
- movhps [edi+18+8], xmm2
-
+ movq [edi+18], xmm2
+ movhps [edi+18+8], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -791,23 +791,23 @@
pop edi
pop esi
ret
-
-
+
+
%macro FILTER_VER 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
-
-
+
+
paddw %7, %5
paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
@@ -815,8 +815,8 @@
%endmacro
;***********************************************************************
;void McHorVer22VerLastAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
+; uint8_t *pTap,
+; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -828,15 +828,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqa xmm0, [esi]
movdqa xmm1, [esi+eax]
@@ -846,12 +846,12 @@
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
movdqa xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -858,61 +858,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -922,9 +922,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
@@ -934,8 +934,8 @@
;***********************************************************************
;void McHorVer22VerLastUnAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
+; uint8_t *pTap,
+; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -947,15 +947,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqu xmm0, [esi]
movdqu xmm1, [esi+eax]
@@ -965,12 +965,12 @@
lea esi, [esi+2*eax]
movdqu xmm4, [esi]
movdqu xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqu xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -977,61 +977,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -1041,9 +1041,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -47,8 +47,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
WelsPrefetchZero_mmx:
mov eax,[esp+4]
prefetchnta [eax]
- ret
+ ret
ALIGN 16
@@ -69,7 +69,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [eax], xmm0
@@ -77,12 +77,12 @@
movdqa [eax+32], xmm0
movdqa [eax+48], xmm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [eax], mm0
@@ -102,16 +102,16 @@
movq [eax+32], mm0
movq [eax+40], mm0
movq [eax+48], mm0
- movq [eax+56], mm0
+ movq [eax+56], mm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -119,17 +119,17 @@
WelsSetMemZeroSize8_mmx:
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8] ; size
- neg ecx
+ neg ecx
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [eax], mm0
add eax, 0x08
-
+
add ecx, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -44,17 +44,17 @@
BITS 32
-SECTION .text
+SECTION .text
;************************************************
-;NEW_QUANT
+;NEW_QUANT
;************************************************
%macro SSE2_Quant8 5
MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
@@ -64,10 +64,10 @@
%macro SSE2_QuantMax8 6
MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pmaxsw %6, %1
@@ -86,17 +86,17 @@
WELS_EXTERN WelsQuant4x4_sse2
align 16
WelsQuant4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
+
mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
ret
-
+
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
@@ -104,17 +104,17 @@
align 16
WelsQuant4x4Dc_sse2:
mov ax, [mf]
- SSE2_Copy8Times xmm3, eax
-
+ SSE2_Copy8Times xmm3, eax
+
mov cx, [ff]
- SSE2_Copy8Times xmm2, ecx
+ SSE2_Copy8Times xmm2, ecx
mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-
- ret
-
+
+ ret
+
;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
@@ -121,20 +121,20 @@
WELS_EXTERN WelsQuantFour4x4_sse2
align 16
WelsQuantFour4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
- mov edx, [pDct]
+
+ mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
ret
@@ -144,17 +144,17 @@
WELS_EXTERN WelsQuantFour4x4Max_sse2
align 16
WelsQuantFour4x4Max_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
- mov edx, [pDct]
+
+ mov edx, [pDct]
pxor xmm4, xmm4
pxor xmm5, xmm5
pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx ], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx ], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@@ -162,20 +162,20 @@
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-
+
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
- pmaxsw xmm0, xmm4
+ pmaxsw xmm0, xmm4
pmaxsw xmm0, xmm5
- pmaxsw xmm0, xmm7
+ pmaxsw xmm0, xmm7
movdqa xmm1, xmm0
punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1
- mov edx, [max]
- movq [edx], xmm0
-
- ret
+ mov edx, [max]
+ movq [edx], xmm0
+ ret
+
%macro MMX_Copy4Times 2
movd %1, %2
punpcklwd %1, %1
@@ -185,10 +185,10 @@
SECTION .text
%macro MMX_Quant4 4
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
@@ -211,13 +211,13 @@
movd mm3, [eax + 0x40]
movd mm1, [eax + 0x60]
punpcklwd mm3, mm1
-
+
mov cx, 0
mov [eax], cx
mov [eax + 0x20], cx
mov [eax + 0x40], cx
mov [eax + 0x60], cx
-
+
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
@@ -229,22 +229,22 @@
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
-
+
;quant_2x2_dc
mov ax, [mf]
- MMX_Copy4Times mm3, eax
+ MMX_Copy4Times mm3, eax
mov cx, [ff]
MMX_Copy4Times mm2, ecx
MMX_Quant4 mm1, mm0, mm2, mm3
-
+
; store dct_2x2
- mov edx, [dct2x2]
+ mov edx, [dct2x2]
movq [edx], mm1
mov ecx, [iChromaDc]
movq [ecx], mm1
-
+
; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
+ pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
@@ -251,10 +251,10 @@
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
movd eax, mm1
-
+
WELSEMMS
ret
-
+
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
@@ -269,7 +269,7 @@
movd mm3, [eax + 0x40]
movd mm1, [eax + 0x60]
punpcklwd mm3, mm1
-
+
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
@@ -281,16 +281,16 @@
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
-
+
;quant_2x2_dc
mov ax, [mf]
- MMX_Copy4Times mm3, eax
+ MMX_Copy4Times mm3, eax
mov cx, [ff]
MMX_Copy4Times mm2, ecx
MMX_Quant4 mm1, mm0, mm2, mm3
-
+
; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
+ pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
@@ -297,16 +297,16 @@
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
movd eax, mm1
-
- WELSEMMS
- ret
-
-
-%macro SSE2_DeQuant8 3
+
+ WELSEMMS
+ ret
+
+
+%macro SSE2_DeQuant8 3
MOVDQ %2, %1
pmullw %2, %3
MOVDQ %1, %2
-%endmacro
+%endmacro
ALIGN 16
@@ -329,7 +329,7 @@
;***********************************************************************====
;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************====
-
+
align 16
WELS_EXTERN WelsDequantFour4x4_sse2
@@ -356,15 +356,15 @@
WELS_EXTERN WelsDequantIHadamard4x4_sse2
align 16
WelsDequantIHadamard4x4_sse2:
- mov eax, [esp + 4]
+ mov eax, [esp + 4]
mov cx, [esp + 8]
-
+
; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, ecx
+ SSE2_Copy8Times xmm1, ecx
;psrlw xmm1, 2 ; for the (>>2) in ihdm
MOVDQ xmm0, [eax]
MOVDQ xmm2, [eax+0x10]
- pmullw xmm0, xmm1
+ pmullw xmm0, xmm1
pmullw xmm2, xmm1
; ihdm_4x4
@@ -371,24 +371,23 @@
movdqa xmm1, xmm0
psrldq xmm1, 8
movdqa xmm3, xmm2
- psrldq xmm3, 8
-
- SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
- SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+ psrldq xmm3, 8
+
+ SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+ SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
- SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
- SSE2_SumSub xmm2, xmm4, xmm5
- SSE2_SumSub xmm1, xmm0, xmm5
- SSE2_SumSub xmm4, xmm0, xmm5
- SSE2_SumSub xmm2, xmm1, xmm5
+ SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
+ SSE2_SumSub xmm2, xmm4, xmm5
+ SSE2_SumSub xmm1, xmm0, xmm5
+ SSE2_SumSub xmm4, xmm0, xmm5
+ SSE2_SumSub xmm2, xmm1, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
-
+
punpcklqdq xmm0, xmm1
MOVDQ [eax], xmm0
-
+
punpcklqdq xmm2, xmm3
- MOVDQ [eax+16], xmm2
+ MOVDQ [eax+16], xmm2
ret
-
\ No newline at end of file
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -37,7 +37,7 @@
;* WelsSampleSatd16x8_sse2
;* WelsSampleSatd8x16_sse2
;* WelsSampleSatd16x16_sse2
-;*
+;*
;* WelsSampleSad16x8_sse2
;* WelsSampleSad16x16_sse2
;*
@@ -99,12 +99,12 @@
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
+%endmacro
-%macro SSE2_SumAbs4 7
+%macro SSE2_SumAbs4 7
WELS_AbsW %1, %3
WELS_AbsW %2, %3
WELS_AbsW %4, %6
@@ -113,13 +113,13 @@
paddusw %4, %5
paddusw %7, %1
paddusw %7, %4
-%endmacro
+%endmacro
%macro SSE2_SumWHorizon 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
@@ -132,12 +132,12 @@
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
+
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
@@ -146,11 +146,11 @@
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
+
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro
;***********************************************************************
@@ -165,8 +165,8 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
-
+ mov edx, [esp+20]
+
movd xmm0, [eax]
movd xmm1, [eax+ebx]
lea eax , [eax+2*ebx]
@@ -174,7 +174,7 @@
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
-
+
movd xmm4, [ecx]
movd xmm5, [ecx+edx]
lea ecx , [ecx+2*edx]
@@ -188,7 +188,7 @@
punpcklbw xmm1, xmm6
punpcklbw xmm4, xmm6
punpcklbw xmm5, xmm6
-
+
psubw xmm0, xmm4
psubw xmm1, xmm5
@@ -196,7 +196,7 @@
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
+
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
@@ -204,7 +204,7 @@
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
-
+
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
@@ -211,16 +211,16 @@
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
-
+
SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
+
+ WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
+ WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
SSE2_SumWHorizon1 xmm6, xmm4
movd eax, xmm6
@@ -228,7 +228,7 @@
shr eax, 1
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -241,16 +241,16 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
+ pxor xmm7, xmm7
+ SSE2_GetSatd8x8
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd eax, xmm6
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -263,15 +263,15 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSatd8x8
-
+ lea ecx, [ecx+2*edx]
+ SSE2_GetSatd8x8
+
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd eax, xmm6
@@ -290,15 +290,15 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
pxor xmm7, xmm7
-
+
SSE2_GetSatd8x8
mov eax, [esp+8]
mov ecx, [esp+16]
add eax, 8
- add ecx, 8
+ add ecx, 8
SSE2_GetSatd8x8
psrlw xmm6, 1
@@ -319,25 +319,25 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
SSE2_GetSatd8x8
-
+
mov eax, [esp+8]
mov ecx, [esp+16]
add eax, 8
add ecx, 8
-
- SSE2_GetSatd8x8
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
SSE2_GetSatd8x8
-
+
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
@@ -353,18 +353,18 @@
;***********************************************************************
;
-;Pixel_satd_intra_sse2 BEGIN
+;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
pmaddubsw %1, xmm5
movdqa %2, %1
pmaddwd %1, xmm7
pmaddwd %2, xmm6
movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
+ punpckldq %1, %2
+ punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
@@ -373,14 +373,14 @@
packssdw %1, %3
psllw %1, 2
%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
pmaddubsw %1, xmm5
movdqa %2, %1
pmaddwd %1, xmm7
pmaddwd %2, xmm6
movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
+ punpckldq %1, %2
+ punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
@@ -387,7 +387,7 @@
; paddd xmm4, %1 ;for dc
; paddd xmm4, %3 ;for dc
movdqa %4, %1
- punpcklqdq %4, %3
+ punpcklqdq %4, %3
packssdw %1, %3
psllw %1, 2
%endmacro
@@ -415,25 +415,25 @@
pinsrw xmm0, word[esi+%2+8], 4
psubsw xmm0, xmm7
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+2], 0
pinsrw xmm0, word[esi+%2+10], 4
psubsw xmm0, xmm1
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+4], 0
pinsrw xmm0, word[esi+%2+12], 4
psubsw xmm0, xmm3
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+6], 0
pinsrw xmm0, word[esi+%2+14], 4
psubsw xmm0, xmm2
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
movq xmm0, [esi+%3+8*%1]
@@ -455,7 +455,7 @@
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
- paddw xmm6, xmm2
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
shl %1, 4
@@ -463,13 +463,13 @@
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
- paddw xmm6, xmm2
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_I16x16GetX38x4Satd 2
SSE41_GetX38x4SatdDec
SSE41_GetX38x4SatdV %1, %2
SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
+ SSE41_I16X16GetX38x4SatdDC
%endmacro
%macro SSE41_ChromaGetX38x4Satd 2
SSE41_GetX38x4SatdDec
@@ -478,11 +478,11 @@
SSE41_ChromaGetX38x4SatdDC %1
%endmacro
%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
%endmacro
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
@@ -493,7 +493,7 @@
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
pxor xmm4, xmm4
movdqa xmm5, [HSumSubDB1]
@@ -507,29 +507,29 @@
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
+ movdqa [esi+16], xmm1
add ecx, edx
pinsrb xmm0, byte[ecx-1], 0
pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 2
pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 4
pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 6
pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 8
pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 10
pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 12
pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 14
pinsrb xmm0, byte[ecx+edx-1], 15
movhlps xmm1, xmm0
@@ -549,7 +549,7 @@
pxor xmm6, xmm6 ;DC
mov ecx, 0
mov edi, 0
-.loop16x16_get_satd:
+.loop16x16_get_satd:
.loopStart1:
SSE41_I16x16GetX38x4Satd ecx, edi
inc ecx
@@ -562,8 +562,8 @@
mov ecx, 0
add edi, 16
jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
+ .loop16x16_get_satd_end:
+ MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
@@ -570,7 +570,7 @@
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
-
+
; comparing order: DC H V
movd ebx, xmm6 ;DC
movd edi, xmm5 ;H
@@ -577,33 +577,33 @@
movd ecx, xmm4 ;V
mov edx, [esp+36]
shl edx, 1
- add edi, edx
- add ebx, edx
+ add edi, edx
+ add ebx, edx
mov edx, [esp+32]
cmp ebx, edi
jge near not_dc_16x16
cmp ebx, ecx
jge near not_dc_h_16x16
-
+
; for DC mode
mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
+ mov eax, ebx
jmp near return_satd_intra_16x16_x3
not_dc_16x16:
- ; for H mode
+ ; for H mode
cmp edi, ecx
jge near not_dc_h_16x16
mov dword[edx], 1;I16_PRED_H
- mov eax, edi
+ mov eax, edi
jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
; for V mode
mov dword[edx], 0;I16_PRED_V
mov eax, ecx
-return_satd_intra_16x16_x3:
+return_satd_intra_16x16_x3:
WELSEMMS
- pop edi
- pop esi
+ pop edi
+ pop esi
pop ebx
ret
@@ -619,13 +619,13 @@
add ecx, edx
pinsrb xmm0, byte[ecx-1], 0
pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 2
pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 4
pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 6
pinsrb xmm0, byte[ecx+edx-1], 7
punpcklqdq xmm0, xmm0
@@ -634,10 +634,10 @@
;(sum+2)>>2
movdqa xmm6, [PDQ2]
movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
+ punpckhqdq xmm5, xmm1
paddd xmm5, xmm6
psrld xmm5, 2
-;(sum1+sum2+4)>>3
+;(sum1+sum2+4)>>3
paddd xmm6, xmm6
paddd xmm4, xmm1
paddd xmm4, xmm6
@@ -644,8 +644,8 @@
psrld xmm4, 3
;satd *16
pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
+ pslld xmm4, 4
+;temp satd
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm5
psllq xmm4, 32
@@ -655,12 +655,12 @@
psllq xmm5, 32
psrlq xmm5, 32
movdqa [esi+48], xmm5
-
+
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
mov ecx, 0
-loop_chroma_satdx3_cb_cr:
+loop_chroma_satdx3_cb_cr:
SSE41_ChromaGetX38x4Satd ecx, 0
inc ecx
cmp ecx, 2
@@ -668,13 +668,13 @@
%endmacro
%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
%endmacro
%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
+ movq2dq %1, %3
+ movq2dq %2, %4
punpcklqdq %1, %2
%endmacro
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
@@ -687,10 +687,10 @@
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
xor edi, edi
-loop_chroma_satdx3:
+loop_chroma_satdx3:
SSE41_ChromaGetX38x8Satd
cmp edi, 1
je loop_chroma_satdx3end
@@ -701,16 +701,16 @@
mov ecx, [esp+44]
mov eax, [esp+48]
jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
+loop_chroma_satdx3end:
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
MMXReg2SSE xmm2, xmm3, mm5, mm6
-
+
paddw xmm4, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
-
- MMX_DW_1_2REG xmm0, xmm1
+
+ MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
@@ -730,57 +730,57 @@
jge near not_dc_8x8
cmp ebx, ecx
jge near not_dc_h_8x8
-
+
; for DC mode
mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
+ mov eax, ebx
jmp near return_satd_intra_8x8_x3
not_dc_8x8:
- ; for H mode
+ ; for H mode
cmp edi, ecx
jge near not_dc_h_8x8
mov dword[edx], 1;I8_PRED_H
- mov eax, edi
+ mov eax, edi
jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
; for V mode
mov dword[edx], 2;I8_PRED_V
mov eax, ecx
-return_satd_intra_8x8_x3:
+return_satd_intra_8x8_x3:
WELSEMMS
- pop edi
- pop esi
+ pop edi
+ pop esi
pop ebx
ret
-
+
;***********************************************************************
;
-;Pixel_satd_intra_sse2 END
+;Pixel_satd_intra_sse2 END
;
;***********************************************************************
%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
+ movd xmm6,%1
+ pshufb xmm6,xmm1
movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
psadbw xmm6,%2
- paddw xmm3,xmm6
+ paddw xmm3,xmm6
%endmacro
%macro WelsAddDCValue 4
movzx %2, byte %1
- mov %3, %2
+ mov %3, %2
add %4, %2
-%endmacro
+%endmacro
;***********************************************************************
;
-;Pixel_sad_intra_ssse3 BEGIN
+;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
@@ -792,14 +792,14 @@
mov edx, [esp+20]
mov edi, [esp+40] ;temp_sad
sub ecx, edx
- movdqa xmm5,[ecx]
+ movdqa xmm5,[ecx]
pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
movd eax,xmm0
-
- add ecx,edx
+
+ add ecx,edx
lea ebx, [edx+2*edx]
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
@@ -824,45 +824,45 @@
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
+ add eax,10h
+ shr eax,5
+ movd xmm7,eax
pxor xmm1,xmm1
pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+;sad begin
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
lea esi, [ebx+2*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
+
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
movhlps xmm0,xmm4
paddw xmm4,xmm0
; comparing order: DC H V
@@ -872,8 +872,8 @@
movd esi, xmm3 ;H
mov eax, [esp+36] ;lamda
shl eax, 1
- add esi, eax
- add ebx, eax
+ add esi, eax
+ add ebx, eax
mov edx, [esp+32]
cmp ebx, esi
jge near not_dc_16x16_sad
@@ -881,7 +881,7 @@
jge near not_dc_h_16x16_sad
; for DC mode
mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
+ mov eax, ebx
sub edi, 192
%assign x 0
%rep 16
@@ -890,11 +890,11 @@
%endrep
jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
- ; for H mode
+ ; for H mode
cmp esi, ecx
jge near not_dc_h_16x16_sad
mov dword[edx], 1;I16_PRED_H
- mov eax, esi
+ mov eax, esi
jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
; for V mode
@@ -914,12 +914,12 @@
;***********************************************************************
;
-;Pixel_sad_intra_ssse3 END
+;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
-;Pixel_satd_wxh_sse41 BEGIN
+;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************
@@ -934,9 +934,9 @@
movq xmm2, [ecx]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
- movq xmm3, [ecx+edx]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
+ movq xmm3, [ecx+edx]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
psubsw xmm0, xmm2
psubsw xmm1, xmm3
movq xmm2, [eax+2*ebx]
@@ -948,12 +948,12 @@
movq xmm4, [ecx+2*edx]
punpcklqdq xmm4, xmm4
pmaddubsw xmm4, xmm7
- movq xmm5, [ecx+edi]
- punpcklqdq xmm5, xmm5
+ movq xmm5, [ecx+edi]
+ punpcklqdq xmm5, xmm5
pmaddubsw xmm5, xmm7
psubsw xmm2, xmm4
psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
pabsw xmm0, xmm0
pabsw xmm2, xmm2
pabsw xmm1, xmm1
@@ -970,18 +970,18 @@
pslld xmm2, 16
psrld xmm4, 16
por xmm2, xmm4
- pmaxuw xmm0, xmm2
+ pmaxuw xmm0, xmm2
paddw xmm6, xmm0
%endmacro
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
%endmacro
;***********************************************************************
;
@@ -990,53 +990,53 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
WelsSampleSatd4x4_sse41:
- push ebx
- mov eax,[esp+8]
- mov ebx,[esp+12]
- mov ecx,[esp+16]
- mov edx,[esp+20]
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[ecx]
- movd xmm5,[ecx+edx]
- shufps xmm2,xmm5,0
- movd xmm3,[ecx+edx*2]
+ push ebx
+ mov eax,[esp+8]
+ mov ebx,[esp+12]
+ mov ecx,[esp+16]
+ mov edx,[esp+20]
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[ecx]
+ movd xmm5,[ecx+edx]
+ shufps xmm2,xmm5,0
+ movd xmm3,[ecx+edx*2]
lea ecx, [edx*2+ecx]
- movd xmm5,[ecx+edx]
- shufps xmm3,xmm5,0
- movd xmm0,[eax]
- movd xmm5,[eax+ebx]
- shufps xmm0,xmm5,0
- movd xmm1,[eax+ebx*2]
+ movd xmm5,[ecx+edx]
+ shufps xmm3,xmm5,0
+ movd xmm0,[eax]
+ movd xmm5,[eax+ebx]
+ shufps xmm0,xmm5,0
+ movd xmm1,[eax+ebx*2]
lea eax, [ebx*2+eax]
- movd xmm5,[eax+ebx]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
+ movd xmm5,[eax+ebx]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
- pop ebx
- ret
-
+ pop ebx
+ ret
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1051,10 +1051,10 @@
mov eax, [esp+16]
mov ebx, [esp+20]
mov ecx, [esp+24]
- mov edx, [esp+28]
+ mov edx, [esp+28]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
@@ -1065,7 +1065,7 @@
pop esi
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1078,17 +1078,17 @@
push esi
push edi
push ebp
-%define pushsize 16
+%define pushsize 16
mov eax, [esp+pushsize+4]
mov ebx, [esp+pushsize+8]
mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
+ mov edx, [esp+pushsize+16]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
mov ebp, 0
-loop_get_satd_8x16:
+loop_get_satd_8x16:
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
lea ecx, [ecx+4*edx]
@@ -1116,10 +1116,10 @@
mov eax, [esp+16]
mov ebx, [esp+20]
mov ecx, [esp+24]
- mov edx, [esp+28]
+ mov edx, [esp+28]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
@@ -1144,7 +1144,7 @@
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
-
+
WELS_EXTERN WelsSampleSatd16x16_sse41
align 16
WelsSampleSatd16x16_sse41:
@@ -1152,17 +1152,17 @@
push esi
push edi
push ebp
- %define pushsize 16
+ %define pushsize 16
mov eax, [esp+pushsize+4]
mov ebx, [esp+pushsize+8]
mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
+ mov edx, [esp+pushsize+16]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
mov ebp, 0
-loop_get_satd_16x16_left:
+loop_get_satd_16x16_left:
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
lea ecx, [ecx+4*edx]
@@ -1206,8 +1206,8 @@
lea ecx, [ecx+2*edx]
movdqu xmm1, [ecx]
MOVDQ xmm2, [eax];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
psadbw xmm1, xmm2
@@ -1218,7 +1218,7 @@
%macro SSE2_GetSad4x16 0
movdqu xmm0, [ecx]
MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
+ psadbw xmm0, xmm2
paddw xmm7, xmm0
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
@@ -1226,8 +1226,8 @@
paddw xmm7, xmm1
movdqu xmm1, [ecx+2*edx]
MOVDQ xmm2, [eax+2*ebx];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
movdqu xmm1, [ecx+edi]
MOVDQ xmm2, [eax+esi]
psadbw xmm1, xmm2
@@ -1265,17 +1265,17 @@
WelsSampleSad16x16_sse2:
push ebx
push edi
- push esi
-
+ push esi
+
%define _STACK_SIZE 12
-
+
mov eax, [esp+_STACK_SIZE+4 ]
mov ebx, [esp+_STACK_SIZE+8 ]
lea esi, [3*ebx]
mov ecx, [esp+_STACK_SIZE+12]
- mov edx, [esp+_STACK_SIZE+16]
- lea edi, [3*edx]
-
+ mov edx, [esp+_STACK_SIZE+16]
+ lea edi, [3*edx]
+
pxor xmm7, xmm7
SSE2_GetSad4x16
lea eax, [eax+4*ebx]
@@ -1290,14 +1290,14 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
-
- %undef _STACK_SIZE
-
+
+ %undef _STACK_SIZE
+
pop esi
pop edi
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
@@ -1312,10 +1312,10 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
movdqu xmm0, [ecx]
MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
+ psadbw xmm0, xmm2
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
psadbw xmm1, xmm2
@@ -1339,19 +1339,19 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
-
+
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
@@ -1375,15 +1375,15 @@
push edi
mov eax, [esp+12]
mov ebx, [esp+16]
-
+
pxor xmm7, xmm7
-
+
mov edi, ecx
and edi, 0x07
- sub ecx, edi
+ sub ecx, edi
mov edx, 8
sub edx, edi
-
+
shl edi, 3
shl edx, 3
movd xmm5, edi
@@ -1391,10 +1391,10 @@
mov edi, 8
add edi, ecx
mov edx, [esp+24]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1402,17 +1402,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1420,7 +1420,7 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
@@ -1427,10 +1427,10 @@
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1438,17 +1438,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1456,10 +1456,10 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
@@ -1469,12 +1469,12 @@
push ebx
mov eax, [esp+8]
mov ebx, [esp+12]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd eax, xmm0
@@ -1485,7 +1485,7 @@
;***********************************************************************
;
-;Pixel_sad_wxh_sse2 END
+;Pixel_sad_wxh_sse2 END
;
;***********************************************************************
@@ -1492,7 +1492,7 @@
;***********************************************************************
;
-;Pixel_sad_4_wxh_sse2 BEGIN
+;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************
@@ -1525,20 +1525,20 @@
movdqu xmm3, [ecx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movdqa xmm1, [eax+ebx]
movdqu xmm3, [ecx+edx]
psadbw xmm3, xmm1
paddw xmm4, xmm3
-
+
movdqu xmm2, [ecx+edx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [ecx+edx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
movdqa xmm2, [eax]
@@ -1599,30 +1599,30 @@
movdqu xmm3, [ecx]
psadbw xmm2, xmm3
paddw xmm5, xmm2
-
+
movdqu xmm2, [ecx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [ecx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movdqu xmm3, [ecx+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov ecx, [esp+24]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [ecx],xmm4
pop ebx
@@ -1646,20 +1646,20 @@
movdqu xmm3, [edi]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movdqa xmm1, [eax+ebx]
movdqu xmm3, [edi+edx]
psadbw xmm3, xmm1
paddw xmm4, xmm3
-
+
movdqu xmm2, [edi+edx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [edi+edx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movdqa xmm2, [eax]
@@ -1688,36 +1688,36 @@
movdqu xmm3, [edi]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movdqu xmm0, [edi-1]
psadbw xmm0, xmm1
paddw xmm6, xmm0
-
+
movdqu xmm3, [edi+1]
psadbw xmm3, xmm1
paddw xmm7, xmm3
-
+
movdqu xmm3, [edi+edx]
psadbw xmm1, xmm3
paddw xmm5, xmm1
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi],xmm4
pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsSampleSadFour8x16_sse2
WelsSampleSadFour8x16_sse2:
push ebx
@@ -1737,10 +1737,10 @@
movhps xmm3, [edi+edx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
@@ -1749,191 +1749,191 @@
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi],xmm4
pop edi
pop ebx
ret
-
-
+
+
WELS_EXTERN WelsSampleSadFour8x8_sse2
WelsSampleSadFour8x8_sse2:
push ebx
@@ -1953,10 +1953,10 @@
movhps xmm3, [edi+edx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
@@ -1965,99 +1965,99 @@
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
-
+
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi],xmm4
pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsSampleSadFour4x4_sse2
WelsSampleSadFour4x4_sse2:
push ebx
@@ -2080,23 +2080,23 @@
punpckldq xmm1, xmm2
movd xmm2, [edi+edx-1]
movd xmm3, [edi+edx+1]
-
+
lea edi, [edi+2*edx]
-
+
movd xmm4, [edi]
movd xmm5, [edi-1]
punpckldq xmm2, xmm5
movd xmm5, [edi+1]
punpckldq xmm3, xmm5
-
+
movd xmm5, [edi+edx]
punpckldq xmm4, xmm5
-
+
punpcklqdq xmm1, xmm4 ;-L
-
+
movd xmm5, [edi+edx-1]
movd xmm6, [edi+edx+1]
-
+
lea edi, [edi+2*edx]
movd xmm7, [edi-1]
punpckldq xmm5, xmm7
@@ -2107,12 +2107,12 @@
movd xmm6, [edi]
movd xmm7, [edi+edx]
punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
+ punpcklqdq xmm4, xmm6 ;+L
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
-
+
movhlps xmm0, xmm1
paddw xmm1, xmm0
movhlps xmm0, xmm2
@@ -2123,13 +2123,13 @@
paddw xmm4, xmm0
mov edi, [esp+28]
punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
+ punpckldq xmm2, xmm3
punpcklqdq xmm1, xmm2
movdqa [edi],xmm1
pop edi
pop ebx
ret
-
+
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
@@ -2150,40 +2150,40 @@
%define pix2address esp+pushsize+12
%define pix2stride esp+pushsize+16
- mov eax, [pix1address]
- mov ebx, [pix1stride ]
- mov ecx, [pix2address]
- mov edx, [pix2stride ]
+ mov eax, [pix1address]
+ mov ebx, [pix1stride ]
+ mov ecx, [pix2address]
+ mov edx, [pix2stride ]
movd mm0, [eax]
movd mm1, [eax+ebx]
punpckldq mm0, mm1
-
+
movd mm3, [ecx]
movd mm4, [ecx+edx]
punpckldq mm3, mm4
psadbw mm0, mm3
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
-
+
movd mm1, [eax]
movd mm2, [eax+ebx]
punpckldq mm1, mm2
-
+
movd mm3, [ecx]
movd mm4, [ecx+edx]
punpckldq mm3, mm4
psadbw mm1, mm3
paddw mm0, mm1
-
+
movd eax, mm0
WELSEMMS
pop ebx
-%undef pushsize
-%undef pix1address
-%undef pix1stride
-%undef pix2address
-%undef pix2stride
+%undef pushsize
+%undef pix1address
+%undef pix1stride
+%undef pix2address
+%undef pix2stride
ret
\ No newline at end of file
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -45,7 +45,7 @@
bits 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
;***********************************************************************
@@ -59,7 +59,7 @@
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
@@ -139,7 +139,7 @@
db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
db 4, 7, 7,11, 4, 8, 7,11, 8,11
db 11,15, 1, 4, 3, 7, 4, 7, 7,11
- db 3, 7, 6,10, 7,10,10,14, 4, 7
+ db 3, 7, 6,10, 7,10,10,14, 4, 7
db 7,11, 7,10,10,14, 7,11,10,14
db 11,14,14,18, 0, 4, 3, 7, 3, 6
db 6,10, 3, 7, 6,10, 7,10,10,14
@@ -191,7 +191,7 @@
movdqa [eax],xmm0
movdqa [eax+16], xmm1
ret
-
+
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
@@ -206,7 +206,7 @@
pinsrw xmm0, eax, 7 ; xmm0[7] = [8]
pinsrw xmm1, ecx, 0 ; xmm1[0] = [7]
pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
+ pshufb xmm0, [pb_scanacdc_maska]
mov eax, [esp+4]
movdqa [eax],xmm0
@@ -224,7 +224,7 @@
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1
punpckhqdq xmm2, xmm1
-
+
movdqa xmm3, xmm0
punpckldq xmm0, xmm2
punpckhdq xmm3, xmm2
@@ -236,10 +236,10 @@
pextrw edx, xmm3, 0
pinsrw xmm3, eax, 0
pinsrw xmm0, edx, 3
-
+
pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39
-
+
movdqa xmm3, xmm2
psrldq xmm1, 2
pslldq xmm3, 14
@@ -255,13 +255,13 @@
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
push ebx
mov eax, [esp+8]
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
-
+
packsswb xmm0, xmm1
pxor xmm3, xmm3
@@ -317,7 +317,7 @@
and edx, 0xff
shr ecx, 8
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
- xor eax, eax
+ xor eax, eax
add al, [nozero_count_table+ecx]
add al, [nozero_count_table+edx]
ret
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@@ -38,7 +38,7 @@
;* 04/14/2010 Created
;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -167,7 +167,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -179,31 +179,31 @@
add edx, ecx ; iLineSize x 3 [edx]
mov eax, ebx
sal eax, $1 ; iLineSize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
+ movq [esp+8], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [esp+24], xmm0
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3
-
+
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
@@ -219,7 +219,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low word truncated
mov ecx, ebx
@@ -227,7 +227,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -253,7 +253,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -265,25 +265,25 @@
add edx, ecx ; iLineSize x 3 [edx]
mov eax, ebx
sal eax, $1 ; iLineSize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
+ movq [esp+8], xmm1
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [esp+24], xmm1
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
@@ -305,7 +305,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low work truncated
mov ecx, ebx
@@ -313,7 +313,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -323,7 +323,7 @@
pop edx
pop ebx
ret
-
+
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@@ -331,11 +331,11 @@
ALIGN 16
MdInterAnalysisVaaInfo_sse41:
mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
+ movdqa xmm0, [eax] ; load 4 sad_8x8
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
@@ -342,7 +342,7 @@
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
pshufd xmm4, xmm3, 01Bh
paddd xmm4, xmm3
pshufd xmm3, xmm4, 0B1h
@@ -354,7 +354,7 @@
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps eax, xmm0
ret
-.threshold_exit:
+.threshold_exit:
mov eax, 15
ret
@@ -365,11 +365,11 @@
ALIGN 16
MdInterAnalysisVaaInfo_sse2:
mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
+ movdqa xmm0, [eax] ; load 4 sad_8x8
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
@@ -376,9 +376,9 @@
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
-
+
; to replace pmulld functionality as below
- movdqa xmm2, xmm3
+ movdqa xmm2, xmm3
pmuludq xmm2, xmm3
pshufd xmm4, xmm3, 0B1h
pmuludq xmm4, xmm4
@@ -385,8 +385,8 @@
movdqa xmm5, xmm2
punpckldq xmm5, xmm4
punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
-
+ punpcklqdq xmm5, xmm2
+
pshufd xmm4, xmm5, 01Bh
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
@@ -398,6 +398,6 @@
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps eax, xmm0
ret
-.threshold_exit:
+.threshold_exit:
mov eax, 15
ret
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""windows.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -1,94 +1,94 @@
-NASM = 1
-NAME = libwelsvp
-
-OUTDIR = ../../../bin/linux
-BINDIR = ../../bin
-OBJDIR = ../../obj
-SRCDIRS = ../../src/asm \
- ../../src/common \
- ../../src/adaptivequantization \
- ../../src/backgounddetection \
- ../../src/denoise \
- ../../src/downsample \
- ../../src/scenechangedetection \
- ../../src/vaacalc \
- ../../src/complexityanalysis
-SRCDIRS += ../../src/imagerotate
-
-
-TARGETLIB = $(BINDIR)/$(NAME).so
-
-CC = $(shell which gcc)
-AS = $(shell which nasm)
-GCC = gcc -m32
-
-CPPFLAGS = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
-LDFLAGS = -lstdc++ -ldl
-
-SRCEXTS = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS = .h
-SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP = $(filter %.cpp,$(SOURCES))
-SRC_ASM = $(filter %.asm,$(SOURCES))
-OBJS = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS = $(OBJS:.o=.d)
-
-DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
- echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm = $(AS) $(ASMFLAGS)
-LINK = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-
-%.d:%.cpp
- @echo -n $(dir $<) > $@
- @$(DEPEND_cpp.d) $< >> $@
-
-%.d:%.asm
- @echo -n $(dir $<) > $@
- @$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
- $(COMPILE.cpp) $< -o $@
-
-%.o:%.asm
- $(COMPILE.asm) $< -o $@
-
-tags: $(HEADERS) $(SOURCES)
- etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
- ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
- @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
- $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
- @echo produce the lib to $(TARGETLIB).
- @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
- @cp -f $(TARGETLIB) $(OUTDIR)
- @cp -f $(TARGETLIB) ../../../testbin
- @echo copy the lib to $(OUTDIR).
-
-clean:
- rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
- rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME = libwelsvp
+
+OUTDIR = ../../../bin/linux
+BINDIR = ../../bin
+OBJDIR = ../../obj
+SRCDIRS = ../../src/asm \
+ ../../src/common \
+ ../../src/adaptivequantization \
+ ../../src/backgounddetection \
+ ../../src/denoise \
+ ../../src/downsample \
+ ../../src/scenechangedetection \
+ ../../src/vaacalc \
+ ../../src/complexityanalysis
+SRCDIRS += ../../src/imagerotate
+
+
+TARGETLIB = $(BINDIR)/$(NAME).so
+
+CC = $(shell which gcc)
+AS = $(shell which nasm)
+GCC = gcc -m32
+
+CPPFLAGS = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
+LDFLAGS = -lstdc++ -ldl
+
+SRCEXTS = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS = .h
+SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP = $(filter %.cpp,$(SOURCES))
+SRC_ASM = $(filter %.asm,$(SOURCES))
+OBJS = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS = $(OBJS:.o=.d)
+
+DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+ echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm = $(AS) $(ASMFLAGS)
+LINK = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+
+%.d:%.cpp
+ @echo -n $(dir $<) > $@
+ @$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+ @echo -n $(dir $<) > $@
+ @$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+ $(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+ $(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+ etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+ ctags $(HEADERS) $(SOURCES)
+
+$(TARGETLIB):$(OBJS)
+ @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+ $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+ @echo produce the lib to $(TARGETLIB).
+ @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+ @cp -f $(TARGETLIB) $(OUTDIR)
+ @cp -f $(TARGETLIB) ../../../testbin
+ @echo copy the lib to $(OUTDIR).
+
+clean:
+ rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+ rm -f $(DEPS) TAGS
+
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
WELS_EXTERN WelsEmms
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,263 +1,263 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* predenoise.asm
-;*
-;* Abstract
-;* denoise for SVC2.1
-;* History
-;* 4/13/2010 Created
-;* 7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
-
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
-
- movdqa %1, %3
- psubusb %1, %8
-
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
-%endmacro
-
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
-%define pushsize 4
-%define pixel esp + pushsize + 4
-%define stride esp + pushsize + 8
-BilateralLumaFilter8_sse2:
- push ebx
-
- pxor xmm7, xmm7
- mov eax, [pixel]
- mov ebx, eax
- movq xmm6, [eax]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
-
- dec eax
- mov ecx, [stride]
-
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
-
- sub eax, ecx
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
-
- lea eax, [eax + ecx * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
-
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [ebx], xmm5
-
- pop ebx
- ret
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
- mov edx, [esp + 4] ; pixels
- mov ecx, [esp + 8] ; stride
-
- mov eax, ecx
- add eax, eax
- sub edx, eax ; pixels - 2 * stride
- sub edx, 2
-
- pxor xmm0, xmm0
- pxor xmm3, xmm3
-
- movdqu xmm1, [edx]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- add edx, eax
- movdqu xmm1, [edx]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [edx + 2], xmm3
-
- ret
\ No newline at end of file
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* predenoise.asm
+;*
+;* Abstract
+;* denoise for SVC2.1
+;* History
+;* 4/13/2010 Created
+;* 7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro WEIGHT_LINE 9
+ movq %2, %9
+ punpcklbw %2, %7
+ movdqa %8, %2
+
+ movdqa %1, %6
+ psubusb %1, %8
+ psubusb %8, %6
+ por %8, %1 ; ABS(curPixel - centerPixel);
+
+ movdqa %1, %3
+ psubusb %1, %8
+
+ pmullw %1, %1
+ psrlw %1, 5
+ pmullw %2, %1
+ paddusw %4, %1
+ paddusw %5, %2
+%endmacro
+
+%macro WEIGHT_LINE1_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE2_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE3_UV 4
+ movdqa %2, %1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20]
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point
+%define pushsize 4
+%define pixel esp + pushsize + 4
+%define stride esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+ push ebx
+
+ pxor xmm7, xmm7
+ mov eax, [pixel]
+ mov ebx, eax
+ movq xmm6, [eax]
+ punpcklbw xmm6, xmm7
+ movdqa xmm3, [sse2_32]
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
+
+ dec eax
+ mov ecx, [stride]
+
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
+
+ sub eax, ecx
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
+
+ lea eax, [eax + ecx * 2]
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
+
+ pcmpeqw xmm0, xmm0
+ psrlw xmm0, 15
+ psllw xmm0, 8
+ psubusw xmm0, xmm4
+ pmullw xmm0, xmm6
+ paddusw xmm5, xmm0
+ psrlw xmm5, 8
+ packuswb xmm5, xmm5
+ movq [ebx], xmm5
+
+ pop ebx
+ ret
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+ mov edx, [esp + 4] ; pixels
+ mov ecx, [esp + 8] ; stride
+
+ mov eax, ecx
+ add eax, eax
+ sub edx, eax ; pixels - 2 * stride
+ sub edx, 2
+
+ pxor xmm0, xmm0
+ pxor xmm3, xmm3
+
+ movdqu xmm1, [edx]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ add edx, eax
+ movdqu xmm1, [edx]
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx]
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [edx + ecx * 2]
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ psrlw xmm3, 6
+ packuswb xmm3, xmm3
+ movq [edx + 2], xmm3
+
+ ret
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,1225 +1,1225 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* upsampling.asm
-;*
-;* Abstract
-;* SIMD for pixel domain down sampling
-;*
-;* History
-;* 10/22/2009 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- ; 2nd part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
-
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
-
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
-
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
-
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
- movq [edi ], mm0
- movq [edi+8], mm2
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movq [edi ], mm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
-.xloops:
- ; 1st part horizonal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movd [edi], mm0
-
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-
-
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
-
-HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
-
-WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
-
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
-
- loop WIDTH
-
-WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg HEIGHT
-
-
-LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
-
-
-
-
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
-
-
-FAST_DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
-
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
-
- loop FAST_WIDTH
-
-FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg FAST_HEIGHT
-
-
-FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* upsampling.asm
+;*
+;* Abstract
+;* SIMD for pixel domain down sampling
+;*
+;* History
+;* 10/22/2009 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Byte-shuffle selector masks (16 bytes each); a selector byte of 80h has bit 7 set, which makes pshufb write zero to that destination byte
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low: ; as a pshufb control: source bytes 0,2,4,..,14 land in the even destination bytes, odd destination bytes become 0
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high: ; as a pshufb control: source bytes 1,3,5,..,15 land in the even destination bytes, odd destination bytes become 0
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse: ; halves width and height: each dst byte = pavgb of the two horizontal pavgb results of a 2x2 src block; 32 src bytes x 2 rows per inner pass, MMX registers
+ push ebx ; five 4-byte pushes + return address: first stack argument is at [esp+24]
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = outer loop count (one pass per output row)
+
+.yloops: ; each pass consumes two source rows and produces one destination row
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 = inner loop count (16 output bytes per pass)
+ neg ebx ; - (iSrcWidth >> 1), used after the row to rewind esi/edi
+ ; each inner pass reads 32 source bytes from each of the two rows
+.xloops:
+ ; 1st half of the horizontal pass: source bytes 0..15 of both rows
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; de-interleave mm0..mm3: even-column bytes go to the low dword, odd-column bytes to the high dword
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; combine mm4..mm7: gather the even-column bytes and the odd-column bytes of each row into separate registers
+ movq mm0, mm4 ; keep a copy, punpckldq overwrites its destination
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; average within the 16-byte x 2-row window
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; kept in mm0 until the 2nd half is done, then both halves are stored together
+
+ ; 2nd half of the horizontal pass: source bytes 16..31 of both rows
+ ; mem hi<- ->lo
+ ;1st Line Src: mm1: d D c C b B a A mm2: h H g G f F e E
+ ;2nd Line Src: mm3: l L k K j J i I mm4: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+
+ ; de-interleave mm1..mm4 the same way (mm0 is left alone, it holds the 1st-half result)
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+
+ ; combine mm5, mm6, mm7, mm1 into even-column / odd-column registers
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
+
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
+
+ ; average within the 16-byte x 2-row window
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, result of the 2nd half
+
+ movq [edi ], mm0 ; output bytes 0..7 (1st half)
+ movq [edi+8], mm2 ; output bytes 8..15 (2nd half)
+
+ ; advance to the next 32 source / 16 destination bytes
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; count is tested after the pass, so the body always runs at least once
+
+ ; move to the next pair of source rows / next destination row
+ lea esi, [esi+2*ecx] ; down two source rows
+ lea esi, [esi+2*ebx] ; rewind by 2 * iDstWidth (the bytes consumed when iSrcWidth is a multiple of 32)
+ lea edi, [edi+edx] ; down one destination row
+ lea edi, [edi+ebx] ; rewind by iDstWidth (the bytes written when iSrcWidth is a multiple of 32)
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp ; NOTE(review): WELSEMMS above is defined outside this chunk - presumably it clears MMX state (emms); confirm. Registers are restored in reverse push order
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse: ; halves width and height: each dst byte = pavgb of the two horizontal pavgb results of a 2x2 src block; 16 src bytes x 2 rows per inner pass, MMX registers
+ push ebx ; five 4-byte pushes + return address: first stack argument is at [esp+24]
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = outer loop count (one pass per output row)
+
+.yloops: ; each pass consumes two source rows and produces one destination row
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 = inner loop count (8 output bytes per pass)
+ neg ebx ; - (iSrcWidth >> 1), used after the row to rewind esi/edi
+ ; each inner pass reads 16 source bytes from each of the two rows
+.xloops:
+ ; horizontal pass: source bytes 0..15 of both rows
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; de-interleave mm0..mm3: even-column bytes go to the low dword, odd-column bytes to the high dword
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; combine mm4..mm7: gather the even-column bytes and the odd-column bytes of each row into separate registers
+ movq mm0, mm4 ; keep a copy, punpckldq overwrites its destination
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; average within the 16-byte x 2-row window
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1: vertical average, the 8 output bytes of this pass
+
+ movq [edi ], mm0 ; store 8 output bytes
+
+ ; advance to the next 16 source / 8 destination bytes
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; count is tested after the pass, so the body always runs at least once
+
+ ; move to the next pair of source rows / next destination row
+ lea esi, [esi+2*ecx] ; down two source rows
+ lea esi, [esi+2*ebx] ; rewind by 2 * iDstWidth (the bytes consumed when iSrcWidth is a multiple of 16)
+ lea edi, [edi+edx] ; down one destination row
+ lea edi, [edi+ebx] ; rewind by iDstWidth (the bytes written when iSrcWidth is a multiple of 16)
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp ; NOTE(review): WELSEMMS above is defined outside this chunk - presumably it clears MMX state (emms); confirm. Registers are restored in reverse push order
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;*** 2:1 downsample in both directions with MMX registers: each 2x2 source block becomes one destination byte through nested pavgb averages ***
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;*** Arguments are read from the stack; every .xloops pass reads 8 bytes from two source rows and stores 4 bytes; WELSEMMS (macro defined outside this chunk) runs before the return ***
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; five pushes = 20 bytes, so the first argument sits at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = number of destination rows (outer loop counter)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = number of 4-byte output groups per row
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 8 bytes
+.xloops:
+ ; horizontal loop: 8 source bytes per row per pass
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
+
+ ; de-interleave row 1 (mm0) into mm2 and row 2 (mm1) into mm4
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm2
+
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm4
+
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
+
+ ; average horizontally adjacent bytes of both rows (8 x 2 source bytes)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1 ; low dword = temp_row1, high dword = temp_row2
+ pshufw mm1, mm0, 04eh ; 01001110 B ; swap the two dwords so temp_row2 lines up with temp_row1
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1 ; the vertical average ends up in the low dword
+
+ movd [edi], mm0 ; store 4 destination bytes
+
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
+
+ dec eax
+ jg near .xloops ; bottom-tested: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance two source rows
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; advance one destination row
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;*** 2:1 downsample in both directions (SSSE3 pshufb): each .xloops pass reads 32 bytes from two source rows and stores 16 destination bytes ***
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;*** All loads and stores use movdqa, which faults on addresses that are not 16-byte aligned; shufb_mask_low/high are defined outside this block ***
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; five pushes = 20 bytes, so the first argument sits at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = number of destination rows (outer loop counter)
+
+ movdqa xmm7, [shufb_mask_low] ; mask low (per the lane diagrams below: selects the upper-case bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (per the lane diagrams below: selects the lower-case bytes into zero-extended words)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = number of 16-byte output groups per row
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; horizontal loop: 2 x 16 source bytes per row per pass
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4 ; horizontal pair average of row 1, first 16 bytes (one result per word)
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5 ; horizontal pair average of row 1, second 16 bytes
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4 ; horizontal pair average of row 2, first 16 bytes
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5 ; horizontal pair average of row 2, second 16 bytes
+
+ packuswb xmm0, xmm1 ; 8 + 8 word results -> 16 bytes for row 1
+ packuswb xmm2, xmm3 ; 8 + 8 word results -> 16 bytes for row 2
+ pavgb xmm0, xmm2 ; vertical average of the two rows
+
+ ; write pDst
+ movdqa [edi], xmm0 ; aligned 16-byte store
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; bottom-tested: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance two source rows
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; advance one destination row
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;*** 2:1 downsample in both directions (SSSE3 pshufb): each .xloops pass reads 16 bytes from two source rows and stores 8 destination bytes ***
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;*** Source loads use movdqa (faults on addresses that are not 16-byte aligned); the 8-byte store uses movq; shufb_mask_low/high are defined outside this block ***
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; five pushes = 20 bytes, so the first argument sits at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = number of destination rows (outer loop counter)
+ movdqa xmm7, [shufb_mask_low] ; mask low (per the lane diagrams below: selects the upper-case bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (per the lane diagrams below: selects the lower-case bytes into zero-extended words)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = number of 8-byte output groups per row
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm2 high bits
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2 ; horizontal pair average of row 1 (one result per word)
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3 ; horizontal pair average of row 2 (one result per word)
+
+ pavgb xmm0, xmm1 ; vertical average of the two rows, still one result per word
+ packuswb xmm0, xmm1 ; low 8 bytes = packed results; the high 8 bytes (from xmm1) are not stored
+
+ ; write pDst
+ movq [edi], xmm0 ; store the low 8 bytes
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; bottom-tested: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance two source rows
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; advance one destination row
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;*** 2:1 downsample in both directions: same arithmetic as the x32 SSSE3 routine, but the source is read with movntdqa (SSE4.1 non-temporal load) ***
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;*** movntdqa loads and the movdqa store fault on addresses that are not 16-byte aligned; shufb_mask_low/high are defined outside this block ***
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; five pushes = 20 bytes, so the first argument sits at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = number of destination rows (outer loop counter)
+
+ movdqa xmm7, [shufb_mask_low] ; mask low (per the lane diagrams below: selects the upper-case bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (per the lane diagrams below: selects the lower-case bytes into zero-extended words)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = number of 16-byte output groups per row
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; horizontal loop: 2 x 16 source bytes per row per pass
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4 ; horizontal pair average of row 1, first 16 bytes (one result per word)
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5 ; horizontal pair average of row 1, second 16 bytes
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4 ; horizontal pair average of row 2, first 16 bytes
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5 ; horizontal pair average of row 2, second 16 bytes
+
+ packuswb xmm0, xmm1 ; 8 + 8 word results -> 16 bytes for row 1
+ packuswb xmm2, xmm3 ; 8 + 8 word results -> 16 bytes for row 2
+ pavgb xmm0, xmm2 ; vertical average of the two rows
+
+ ; write pDst
+ movdqa [edi], xmm0 ; aligned 16-byte store
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; bottom-tested: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance two source rows
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; advance one destination row
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;*** 2:1 downsample in both directions: same arithmetic as the x16 SSSE3 routine, but the source is read with movntdqa (SSE4.1 non-temporal load) ***
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;*** movntdqa faults on addresses that are not 16-byte aligned; the 8-byte store uses movq; shufb_mask_low/high are defined outside this block ***
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; five pushes = 20 bytes, so the first argument sits at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1 = number of destination rows (outer loop counter)
+ movdqa xmm7, [shufb_mask_low] ; mask low (per the lane diagrams below: selects the upper-case bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (per the lane diagrams below: selects the lower-case bytes into zero-extended words)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = number of 8-byte output groups per row
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2 ; horizontal pair average of row 1 (one result per word)
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3 ; horizontal pair average of row 2 (one result per word)
+
+ pavgb xmm0, xmm1 ; vertical average of the two rows, still one result per word
+ packuswb xmm0, xmm1 ; low 8 bytes = packed results; the high 8 bytes (from xmm1) are not stored
+
+ ; write pDst
+ movq [edi], xmm0 ; store the low 8 bytes
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; bottom-tested: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance two source rows
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; advance one destination row
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+;*** Arbitrary-ratio bilinear downsampler (SSE2): interior pixels blend a 2x2 source neighbourhood with 15-bit fixed-point weights; the last column of each row and the whole last row are point-sampled ***
+;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;*** NOTE(review): the prototype says int, but nothing loads a return value into eax before ret; dwSrcWidth, dwSrcHeight and scale are defined but never read here ***
+
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize ; reserve the local frame that holds tmpHeight, yInverse, xInverse and dstStep
+
+ pxor xmm0, xmm0 ; xmm0 = 0
+ mov edx, 32767 ; NOTE(review): edx is not read before it is reloaded with 40003fffh below - looks like a dead store
+ mov eax, [uiScaleX]
+ and eax, 32767 ; keep the low 15 bits of the X step
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767 ; negated step, also reduced to 15 bits
+ movd xmm1, eax ; uinc = uiScaleX & 32767 (i.e. mod 32768)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767 ; keep the low 15 bits of the Y step
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767 ; negated step, also reduced to 15 bits
+ movd xmm6, eax ; vinc = uiScaleY & 32767 (i.e. mod 32768)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 40003fffh ; high word 4000h = 16384, low word 3fffh = 16383
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax ; dwDstHeight - 1 rows go through HEIGHT; the final row is written at LAST_ROW
+ mov [tmpHeight], eax
+ mov eax, 16384 ; starting Y position (the integer row index is taken with shr 15)
+ mov [yInverse], eax
+
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer source row index
+ mul dword [dwSrcStride] ; eax = row index * stride (mul also overwrites edx)
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = address of the source row below
+
+ mov eax, 16384 ; starting X position (the integer column index is taken with shr 15)
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx ; dwDstWidth - 1 interpolated pixels ; NOTE(review): if dwDstWidth is 1, ecx is 0 and 'loop WIDTH' below would wrap around - confirm callers never pass that
+
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+
+WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15 ; integer source column index
+
+ movd xmm1, [esi+eax] ; xxxxxxba (4 bytes are read, only a and b are used)
+ movd xmm2, [ebp+eax] ; xxxxxxdc (4 bytes are read, only c and d are used)
+ pxor xmm0, xmm0 ; re-zero xmm0, which is reused as scratch further down
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 -> four dword weights
+ movdqa xmm0, xmm2 ; copy of the weights for the odd dwords
+ pmuludq xmm2, xmm1 ; dwords 0 and 2: weight * pixel (a, c) as 64-bit products
+ psrlq xmm0, 32 ; move weight dwords 1 and 3 into the even slots
+ psrlq xmm1, 32 ; move pixel dwords 1 and 3 (b, d) into the even slots
+ pmuludq xmm0, xmm1 ; weight * pixel (b, d)
+ paddq xmm2, xmm0 ; a+b products in the low qword, c+d products in the high qword
+ pshufd xmm1, xmm2, 00001110b ; bring the high qword down
+ paddq xmm2, xmm1 ; low qword = sum of the four weighted pixels
+ psrlq xmm2, 29 ; keep one extra bit for the rounding step below
+
+ movd eax, xmm2
+ inc eax ; +1 then >> 1: round to nearest
+ shr eax, 1
+ mov [edi], al ; store one destination byte
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax ; advance the X position by the full (unmasked) step
+
+ paddw xmm3, xmm7 ; inc u
+ psllw xmm3, 1 ; shift left then right by 1 clears bit 15 of each word,
+ psrlw xmm3, 1 ; i.e. keeps u within 15 bits
+
+ loop WIDTH ; ecx -= 1, repeat while ecx != 0
+
+WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax] ; last pixel of the row: copy the source byte without interpolation
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax ; advance the Y position by the full (unmasked) step
+ add edi, [dstStep] ; skip the padding to the start of the next destination row
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1 ; shift left then right by 1 clears bit 15 of each word,
+ psrlw xmm4, 1 ; i.e. keeps v within 15 bits
+
+ dec dword [tmpHeight]
+ jg HEIGHT ; bottom-tested ; NOTE(review): the HEIGHT body runs once even when dwDstHeight is 1 - confirm callers never pass that
+
+
+LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer source row index
+ mul dword [dwSrcStride] ; eax = row index * stride (mul also overwrites edx)
+ add esi, eax ; get current row address
+
+ mov eax, 16384 ; restart the X position
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth] ; the whole last row is point-sampled, so all dwDstWidth pixels go through this loop
+
+LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15 ; integer source column index
+
+ mov al, [esi+eax] ; copy the source byte without interpolation
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop LAST_ROW_WIDTH ; ecx -= 1, repeat while ecx != 0
+
+LAST_ROW_END:
+
+ add esp, localsize ; release the local frame
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+
+
+
+
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+;*** Arbitrary-ratio bilinear downsampler (SSE2) using 16-bit weights (pmulhuw + pmaddwd); the last column of each row and the whole last row are point-sampled ***
+;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;*** X positions use shr 16 while Y positions use shr 15; NOTE(review): the prototype says int, but nothing loads a return value into eax before ret ***
+
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize ; reserve the local frame that holds tmpHeight, yInverse, xInverse and dstStep
+
+ pxor xmm0, xmm0 ; xmm0 = 0 ; it is only read after this point, so it stays zero for the whole function
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx ; keep the low 16 bits of the X step
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535 ; negated step, also reduced to 16 bits
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767 ; keep the low 15 bits of the Y step
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767 ; negated step, also reduced to 15 bits
+ movd xmm6, eax ; vinc = uiScaleY & 32767 (i.e. mod 32768)
+ movd xmm2, ebx ; -vinc
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384 ; rounding term added before the final >> 15 ; ebx is not modified again until it is restored
+
+
+FAST_DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax ; dwDstHeight - 1 rows go through FAST_HEIGHT; the final row is written at FAST_LAST_ROW
+ mov [tmpHeight], eax
+ mov eax, 16384 ; starting Y position (the integer row index is taken with shr 15)
+ mov [yInverse], eax
+
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer source row index
+ mul dword [dwSrcStride] ; eax = row index * stride (mul also overwrites edx)
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = address of the source row below
+
+ mov eax, 32768 ; starting X position (the integer column index is taken with shr 16)
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+ dec ecx ; dwDstWidth - 1 interpolated pixels ; NOTE(review): if dwDstWidth is 1, ecx is 0 and 'loop FAST_WIDTH' below would wrap around - confirm callers never pass that
+
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16 ; integer source column index
+
+ movd xmm1, [esi+eax] ; xxxxxxba (4 bytes are read, only a and b are used)
+ movd xmm2, [ebp+eax] ; xxxxxxdc (4 bytes are read, only c and d are used)
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 (keeps the high 16 bits of each unsigned product)
+ pmaddwd xmm2, xmm1 ; dword 0 = weighted a + b, dword 1 = weighted c + d
+ pshufd xmm1, xmm2, 00000001b ; bring dword 1 down to dword 0
+ paddd xmm2, xmm1 ; dword 0 = sum of the four weighted pixels
+ movd xmm1, ebx ; rounding term 16384
+ paddd xmm2, xmm1
+ psrld xmm2, 15
+
+ packuswb xmm2, xmm0 ; pack words to bytes with unsigned saturation
+ movd eax, xmm2
+ mov [edi], al ; store one destination byte
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax ; advance the X position by the full (unmasked) step
+
+ paddw xmm3, xmm7 ; inc u
+
+ loop FAST_WIDTH ; ecx -= 1, repeat while ecx != 0
+
+FAST_WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax] ; last pixel of the row: copy the source byte without interpolation
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax ; advance the Y position by the full (unmasked) step
+ add edi, [dstStep] ; skip the padding to the start of the next destination row
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1 ; shift left then right by 1 clears bit 15 of each word,
+ psrlw xmm4, 1 ; i.e. keeps v within 15 bits
+
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT ; bottom-tested ; NOTE(review): the FAST_HEIGHT body runs once even when dwDstHeight is 1 - confirm callers never pass that
+
+
+FAST_LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer source row index
+ mul dword [dwSrcStride] ; eax = row index * stride (mul also overwrites edx)
+ add esi, eax ; get current row address
+
+ mov eax, 32768 ; restart the X position
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth] ; the whole last row is point-sampled, so all dwDstWidth pixels go through this loop
+
+FAST_LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16 ; integer source column index
+
+ mov al, [esi+eax] ; copy the source byte without interpolation
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop FAST_LAST_ROW_WIDTH ; ecx -= 1, repeat while ecx != 0
+
+FAST_LAST_ROW_END:
+
+ add esp, localsize ; release the local frame
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
 ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -1,145 +1,145 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes: times 16 db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
- lea eax, [eax+ecx*2]
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx+%1], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+%1+0x10], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
- SSE2_PRED_H_16X16_TWO_LINE 0x40
- SSE2_PRED_H_16X16_TWO_LINE 0x60
- SSE2_PRED_H_16X16_TWO_LINE 0x80
- SSE2_PRED_H_16X16_TWO_LINE 0xa0
- SSE2_PRED_H_16X16_TWO_LINE 0xc0
- SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- sub eax, ecx
- movdqa xmm0, [eax]
-
- movdqa [edx], xmm0
- movdqa [edx+10h], xmm0
- movdqa [edx+20h], xmm0
- movdqa [edx+30h], xmm0
- movdqa [edx+40h], xmm0
- movdqa [edx+50h], xmm0
- movdqa [edx+60h], xmm0
- movdqa [edx+70h], xmm0
- movdqa [edx+80h], xmm0
- movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
- movdqa [edx+176], xmm0
- movdqa [edx+192], xmm0
- movdqa [edx+208], xmm0
- movdqa [edx+224], xmm0
- movdqa [edx+240], xmm0
-
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred.asm
+;*
+;* Abstract
+;* sse2 function for intra predict operations
+;*
+;* History
+;* 18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes: times 16 db 1 ; sixteen 01h bytes; the COPY_16_TIMES macros use it as a pmuludq multiplier to replicate a byte across a dword
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro COPY_16_TIMES 2 ; arg 1 = address register, arg 2 = xmm register: fill all 16 bytes of arg 2 with the byte stored just before the address in arg 1
+ movdqa %2, [%1-16] ; aligned load of the 16 bytes that end just before the address
+ psrldq %2, 15 ; shift right by 15 bytes: only the last loaded byte remains, in byte 0
+ pmuludq %2, [mmx_01bytes] ; dword 0 times 01010101h replicates that byte into all 4 bytes of dword 0
+ pshufd %2, %2, 0 ; broadcast dword 0 to all four dwords
+%endmacro
+
+%macro COPY_16_TIMESS 3 ; like COPY_16_TIMES, but the address is arg 1 + arg 3 (arg 3 = offset register); arg 2 = xmm register to fill
+ movdqa %2, [%1+%3-16] ; aligned load of the 16 bytes that end just before the address
+ psrldq %2, 15 ; shift right by 15 bytes: only the last loaded byte remains, in byte 0
+ pmuludq %2, [mmx_01bytes] ; dword 0 times 01010101h replicates that byte into all 4 bytes of dword 0
+ pshufd %2, %2, 0 ; broadcast dword 0 to all four dwords
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1 ; arg 1 = byte offset into the block at edx; writes two 16-byte rows there; expects eax = source row pointer, ecx = stride
+ lea eax, [eax+ecx*2] ; step the source pointer down two rows
+
+ COPY_16_TIMES eax, xmm0
+ movdqa [edx+%1], xmm0 ; first row: 16 copies of the byte just before [eax]
+ COPY_16_TIMESS eax, xmm0, ecx
+ movdqa [edx+%1+0x10], xmm0 ; second row: 16 copies of the byte just before [eax+ecx]
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2: ; writes 16 rows of 16 bytes to pred (rows 16 bytes apart); row r is filled with the byte at pRef[r*stride - 1]
+ mov edx, [esp+4] ; pred
+ mov eax, [esp+8] ; pRef
+ mov ecx, [esp+12] ; stride
+
+ COPY_16_TIMES eax, xmm0
+ movdqa [edx], xmm0 ; row 0 (movdqa: pred must be 16-byte aligned)
+ COPY_16_TIMESS eax, xmm0, ecx
+ movdqa [edx+0x10], xmm0 ; row 1
+
+ SSE2_PRED_H_16X16_TWO_LINE 0x20
+ SSE2_PRED_H_16X16_TWO_LINE 0x40
+ SSE2_PRED_H_16X16_TWO_LINE 0x60
+ SSE2_PRED_H_16X16_TWO_LINE 0x80
+ SSE2_PRED_H_16X16_TWO_LINE 0xa0
+ SSE2_PRED_H_16X16_TWO_LINE 0xc0
+ SSE2_PRED_H_16X16_TWO_LINE 0xe0
+
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2: ; copies the 16 bytes at pRef - stride into each of the 16 rows of pred (rows 16 bytes apart)
+ mov edx, [esp+4] ; pred
+ mov eax, [esp+8] ; pRef
+ mov ecx, [esp+12] ; stride
+
+ sub eax, ecx ; eax = pRef - stride
+ movdqa xmm0, [eax] ; aligned 16-byte load (movdqa faults on an unaligned address)
+
+ movdqa [edx], xmm0 ; rows 0..15 follow, one 16-byte aligned store each
+ movdqa [edx+10h], xmm0
+ movdqa [edx+20h], xmm0
+ movdqa [edx+30h], xmm0
+ movdqa [edx+40h], xmm0
+ movdqa [edx+50h], xmm0
+ movdqa [edx+60h], xmm0
+ movdqa [edx+70h], xmm0
+ movdqa [edx+80h], xmm0
+ movdqa [edx+90h], xmm0
+ movdqa [edx+160], xmm0 ; offsets switch from hex to decimal here: 160 = 0a0h
+ movdqa [edx+176], xmm0
+ movdqa [edx+192], xmm0
+ movdqa [edx+208], xmm0
+ movdqa [edx+224], xmm0
+ movdqa [edx+240], xmm0 ; 240 = 0f0h, the last of the 16 rows
+
 ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -1,79 +1,79 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* pixel_sse2.asm
-;*
-;* Abstract
-;* WelsSampleSad8x8_sse21
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movhps xmm0, [eax]
- movhps xmm1, [eax+ebx]
-
- movq xmm2, [ecx]
- movq xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movhps xmm2, [ecx]
- movhps xmm3, [ecx+edx]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* pixel_sse2.asm
+;*
+;* Abstract
+;* WelsSampleSad8x8_sse21
+;*
+;* History
+;* 8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0 ; adds the SAD of two 8-byte-wide, 4-row blocks to xmm6; inputs: eax/ebx = block A pointer/stride, ecx/edx = block B pointer/stride
+ movq xmm0, [eax] ; A row 0 -> low qword of xmm0
+ movq xmm1, [eax+ebx] ; A row 1 -> low qword of xmm1
+ lea eax, [eax+2*ebx] ; eax now points at A row 2 and is left there when the macro ends
+ movhps xmm0, [eax] ; A row 2 -> high qword of xmm0
+ movhps xmm1, [eax+ebx] ; A row 3 -> high qword of xmm1
+; same packing for block B: xmm2 = rows 0|2, xmm3 = rows 1|3
+ movq xmm2, [ecx]
+ movq xmm3, [ecx+edx]
+ lea ecx, [ecx+2*edx] ; ecx now points at B row 2 and is left there when the macro ends
+ movhps xmm2, [ecx]
+ movhps xmm3, [ecx+edx]
+ psadbw xmm0, xmm2 ; one sum of absolute differences per qword: rows 0 (low half) and 2 (high half)
+ psadbw xmm1, xmm3 ; likewise for rows 1 and 3
+ paddw xmm6, xmm0 ; accumulate; xmm6 keeps separate partial sums in its low and high qword
+ paddw xmm6, xmm1 ; the caller still has to add the two halves of xmm6 to get the total
+%endmacro
+
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline -- sets flags only; %1 is destroyed (overwritten with the masked address)
+and %1, 0x1f|(%3>>1) ; mask is 0x1f for cacheline=32 and 0x3f for cacheline=64, i.e. the offset inside the cache line
+cmp %1, (32-%2)|(%3>>1) ; for cacheline 32 or 64 (width <= 16) this equals cacheline-width: the last offset at which a width-byte access still fits
+%endmacro
+
+
%macro SSE2_GetSad8x4 0
movq xmm0, [eax]
movq xmm1, [eax+ebx]
@@ -90,12 +90,12 @@
psadbw xmm1, xmm3
paddw xmm6, xmm0
paddw xmm6, xmm1
-%endmacro
+%endmacro
-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
SECTION .text
WELS_EXTERN WelsSampleSad8x8_sse21
@@ -108,15 +108,15 @@
push edi
mov eax, [esp+12]
mov ebx, [esp+16]
-
+
pxor xmm7, xmm7
-
+
mov edi, ecx
and edi, 0x07
- sub ecx, edi
+ sub ecx, edi
mov edx, 8
sub edx, edi
-
+
shl edi, 3
shl edx, 3
movd xmm5, edi
@@ -124,10 +124,10 @@
mov edi, 8
add edi, ecx
mov edx, [esp+24]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -135,17 +135,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -153,7 +153,7 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
@@ -160,10 +160,10 @@
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -171,17 +171,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -189,10 +189,10 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
@@ -202,12 +202,12 @@
push ebx
mov eax, [esp+8]
mov ebx, [esp+12]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd eax, xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -1,1589 +1,1589 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
-; movdqa %1, %2
-; punpcklbw %1, %3
-; punpckhbw %2, %3
-; paddw %1, %2
-; pmaddwd %1, %4
-; pshufd %2, %1, 04Eh ; 01001110 B
-; paddd %1, %2
-; pshufd %2, %1, 0B1h ; 10110001 B
-; paddd %1, %2
-;%endmacro ; END OF SUM_SSE2
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
-%endmacro ; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $4
-%endmacro
-
-%macro WELS_SAD_16x2_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, [esi+ebx]
- movdqa xmm4, [edi+ebx]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea esi, [esi+ebx*2]
- lea edi, [edi+ebx*2]
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
-
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
-
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
-
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SD_MAD_16x1_SSE2 4
-%define sad_reg %1
-%define sum_cur_reg %2
-%define sum_ref_reg %3
-%define mad_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-
-%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
-%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
-%endmacro
-
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4
-%define sad_reg %1
-%define sum_reg %2
-%define mad_reg %3
-%define sqdiff_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
-
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
-
- add esi, ebx
- add edi, ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-; dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
- push esi
- push edi
- push ebp
- push ebx
- push edx
-
- mov esi, [esp+24]
- mov edi, [esp+28]
- mov ebx, [esp+32]
- mov ecx, [esp+36]
- mov edx, [esp+40]
- pxor xmm0, xmm0
-.hloop:
- mov eax, ebx
- mov ebp, $0
-.wloop:
- movdqa xmm1, [esi+ebp]
- movdqa xmm2, [edi+ebp]
- psadbw xmm1, xmm2
- pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- add ebp, 010h
- dec eax
- jnz near .wloop
- lea esi, [esi+edx]
- lea edi, [edi+edx]
- dec ecx
- jnz near .hloop
-
- movd eax, xmm0
- pop edx
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
- push esi
- push edi
- push ebx
-
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
-
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
-
- pxor xmm7, xmm7
- movdqu SUM, xmm7
-
-.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
-
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
-
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
-
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
-
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
-
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
-
- add esp, 16
- pop ebx
- pop edi
- pop esi
-
- ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+24], xmm0
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low word truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+24], xmm1
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low work truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-%endif
-
-
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum)
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define ref_orig esp + pushsize + 4
-%define cur_orig esp + pushsize + 8
-%define iPicStride esp + pushsize + 12
-%define gom_pixel_num esp + pushsize + 16
-%define pSum esp + pushsize + 20
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [ref_orig]
- mov edi, [cur_orig]
- mov ebx, [iPicStride]
- mov eax, [gom_pixel_num]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0
-mb_width_loop_p:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_p:
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- psadbw xmm1, xmm2
- paddd xmm0, xmm1
- add esi, 16
- add edi, 16
- cmp esi, edx
- jl gom_row_loop_p
-
- sub esi, eax
- sub edi, eax
- add esi, ebx
- add edi, ebx
- loop mb_width_loop_p
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddd xmm1, xmm0
- movd eax, xmm1
- mov edx, [pSum] ; pSum
- add [edx], eax
-
-%undef ref_orig
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define cur_orig esp + pushsize + 4
-%define iPicStride esp + pushsize + 8
-%define gom_pixel_num esp + pushsize + 12
-%define pSum esp + pushsize + 16
-%define pSqrSum esp + pushsize + 20
-%define pushsize 8
- push esi
- push ebx
- mov esi, [cur_orig]
- mov eax, [gom_pixel_num]
- mov ebx, [iPicStride]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0 ; zero
- pxor xmm1, xmm1 ; sum
- pxor xmm2, xmm2 ; sqr sum
-mb_width_loop_i:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_i:
- movdqa xmm3, [esi]
- movdqa xmm4, xmm3
- psadbw xmm4, xmm0
- paddd xmm1, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm4, xmm4
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- paddd xmm2, xmm4
- add esi, 16
- cmp esi, edx
- jl gom_row_loop_i
-
- sub esi, eax
- add esi, ebx
- loop mb_width_loop_i
-
- movdqa xmm3, xmm1
- psrldq xmm3, 8
- paddd xmm1, xmm3
- movd eax, xmm1
- mov edx, [pSum]
- add [edx], eax
-
- movdqa xmm3, xmm2
- psrldq xmm3, 8
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psrldq xmm3, 4
- paddd xmm2, xmm3
- movd eax, xmm2
- mov edx, [pSqrSum]
- add [edx], eax
-
-
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pSqrSum
-%undef pushsize
- pop ebx
- pop esi
- ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define cur_data esp + pushsize + 4
-%define ref_data esp + pushsize + 8
-%define iPicWidth esp + pushsize + 12
-%define iPicHeight esp + pushsize + 16
-%define iPicStride esp + pushsize + 20
-%define psadframe esp + pushsize + 24
-%define psad8x8 esp + pushsize + 28
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
-width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz width_loop
-
- pop edi
- pop esi
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define localsize 8
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
-
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
-
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz var_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz var_height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
-
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
-
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz sqdiff_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
-
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
-
-
-
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define p_sd8x8 esp + pushsize + localsize + 32
-%define p_mad8x8 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_ecx esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
-bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
-
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
-
-
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
-
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
-
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz bgd_height_loop
-
- mov edx, [psadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define localsize 16
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define p_sd8x8 esp + pushsize + localsize + 44
-%define p_mad8x8 esp + pushsize + localsize + 48
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define tmp_ecx esp + 12
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
-
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
-
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
+; movdqa %1, %2
+; punpcklbw %1, %3
+; punpckhbw %2, %3
+; paddw %1, %2
+; pmaddwd %1, %4
+; pshufd %2, %1, 04Eh ; 01001110 B
+; paddd %1, %2
+; pshufd %2, %1, 0B1h ; 10110001 B
+; paddd %1, %2
+;%endmacro ; END OF SUM_SSE2
+
+; Horizontal add of the eight words in %1; per the original author's comparison this outperforms a phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp -- %1 is input and result (total lands in its low word), %2 is scratch
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
+ paddw %1, %2 ; every word now holds w[i] + w[i xor 4]
+ pshuflw %2, %1, 04Eh ; 01001110 B: swap the two dwords of the low qword
+ paddw %1, %2 ; words 0 and 1 each hold the sum of four source words
+ pshuflw %2, %1, 0B1h ; 10110001 B: swap adjacent words of the low qword
+ paddw %1, %2 ; word 0 = sum of all eight source words (16-bit wraparound arithmetic)
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
+
+%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero -- dst = sum of the squares of the 16 bytes in pSrc; pSrc is clobbered
+ movdqa %1, %2
+ punpcklbw %1, %3 ; low 8 bytes -> words (zero-extended when %3 is all zero)
+ punpckhbw %2, %3 ; high 8 bytes -> words
+ pmaddwd %1, %1 ; square each word and add adjacent pairs -> 4 dwords
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B: swap adjacent dwords
+ paddd %1, %2 ; every dword of %1 now holds the full total
+%endmacro ; END OF SUM_SQR_SSE2
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- dst words 0..3 = averages of the four 4x4 blocks of the 16x4 strip at esi
+ movdqa %1, [esi ] ; line 0 (ecx/ebx/edx are the byte offsets of lines 1/2/3; xmm7 is expected to be zero)
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7 ; line 0, columns 0-7 as words
+ punpckhbw %3, xmm7 ; line 0, columns 8-15 as words
+ movdqa %4, %2
+ punpcklbw %4, xmm7 ; line 1, columns 0-7
+ punpckhbw %2, xmm7 ; line 1, columns 8-15
+ paddw %1, %4 ; lines 0+1, columns 0-7
+ paddw %2, %3 ; lines 0+1, columns 8-15
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6 ; lines 2+3, columns 0-7
+ paddw %4, %5 ; lines 2+3, columns 8-15
+ paddw %1, %3 ; block 0, 1: four-line column sums for columns 0-7
+ paddw %2, %4 ; block 2, 3: four-line column sums for columns 8-15
+ pshufd %3, %1, 0B1h ; swap adjacent dwords
+ pshufd %4, %2, 0B1h
+ paddw %1, %3 ; each word = col[i] + col[i xor 2]
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h ; swap adjacent words, low qword only
+ pshufhw %6, %3, 0B1h ; swap adjacent words, high qword only
+ paddw %1, %5 ; low words of %1 = block 0 total
+ paddw %3, %6 ; high words of %3 = block 1 total
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5 ; low words of %2 = block 2 total
+ paddw %4, %6 ; high words of %4 = block 3 total
+ punpcklwd %1, %2 ; b0 b2 b0 b2 ...
+ punpckhwd %3, %4 ; b1 b3 b1 b3 ...
+ punpcklwd %1, %3 ; b0 b1 b2 b3 b0 b1 b2 b3
+ psraw %1, $4 ; /16 -> mean of the 16 pixels of each block
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- phaddw variant: dst words 0..3 = averages of the four 4x4 blocks of the 16x4 strip at esi
+ movdqa %1, [esi ] ; line 0 (ecx/ebx/edx are the byte offsets of lines 1/2/3; xmm7 is expected to be zero)
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7 ; line 0, columns 0-7 as words
+ punpckhbw %3, xmm7 ; line 0, columns 8-15 as words
+ movdqa %4, %2
+ punpcklbw %4, xmm7 ; line 1, columns 0-7
+ punpckhbw %2, xmm7 ; line 1, columns 8-15
+ paddw %1, %4 ; lines 0+1, columns 0-7
+ paddw %2, %3 ; lines 0+1, columns 8-15
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6 ; lines 2+3, columns 0-7
+ paddw %4, %5 ; lines 2+3, columns 8-15
+ paddw %1, %3 ; block 0, 1: four-line column sums for columns 0-7
+ paddw %2, %4 ; block 2, 3: four-line column sums for columns 8-15
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; .. (bit ranges: two pair-sums per block)
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; .... (upper four words become 0)
+ psraw %1, $4 ; /16 -> mean of the 16 pixels of each block
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2 0 ; xmm6 += SAD of two 16-byte rows at esi vs. edi, then esi/edi advance by 2*ebx; clobbers xmm1-xmm4
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, [esi+ebx]
+ movdqa xmm4, [edi+ebx]
+ psadbw xmm1, xmm2 ; one partial SAD per 8-byte half, in the low word of each qword
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea esi, [esi+ebx*2] ; step both pointers two rows down
+ lea edi, [edi+ebx*2]
+%endmacro
+
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0 ; one 16-byte row: xmm6 += SAD([esi],[edi]), xmm5 += byte sum of [esi], xmm4 += squares of [esi]; expects xmm0 = 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3 ; sad, one partial per 8-byte half
+
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0 ; SAD against zero = plain sum of the bytes
+ paddd xmm5, xmm3 ; sum
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0 ; bytes -> words
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> dwords
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2 ; sqsum, spread over four dwords
+
+ add esi, ebx ; next row
+ add edi, ebx
+%endmacro
+
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0 ; one 16-byte row at esi (cur) / edi (ref): accumulates xmm7/xmm6/xmm5/xmm4 as labelled below; expects xmm0 = 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff: max - min = per-byte absolute difference
+
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0 ; SAD against zero = plain sum of the bytes
+ paddd xmm6, xmm2 ; sum
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0 ; bytes -> words
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1 ; square and add adjacent pairs -> dwords
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
+
+ add esi, ebx ; next row
+ add edi, ebx
+%endmacro
+
+%macro WELS_SAD_SD_MAD_16x1_SSE2 4 ; sad, sum_cur, sum_ref, mad accumulators; one 16-byte row at esi (cur) / edi (ref); expects xmm0 = 0, clobbers xmm1-xmm3
+%define sad_reg %1
+%define sum_cur_reg %2
+%define sum_ref_reg %3
+%define mad_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0 ; SAD against zero = plain sum of the bytes
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ psadbw xmm3, xmm0 ; summing the absolute differences gives the SAD
+ paddd sad_reg, xmm3 ; sad
+
+ add esi, ebx ; next row
+ add edi, ebx
+%endmacro
+
+
+%macro WELS_MAX_REG_SSE2 1 ; byte 0 of %1 = max of bytes 0-7, byte 8 = max of bytes 8-15; only xmm1 is used as scratch here
+%define max_reg %1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1 ; b[i] = max(b[i], b[i+4])
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1 ; b[i] now covers b[i], b[i+2], b[i+4], b[i+6]
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1 ; b[i] = max of b[i..i+7] (zeros are shifted in past byte 15)
+%endmacro
+
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4 ; sad(+sqsum), sum, mad, sqdiff accumulators; one 16-byte row at esi (cur) / edi (ref); expects xmm0 = 0, clobbers xmm1-xmm3
+%define sad_reg %1
+%define sum_reg %2
+%define mad_reg %3
+%define sqdiff_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2 ; squares of the cur bytes, pairwise added -> dwords
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3 ; four dwords d0..d3 of partial square sums
+ movdqa xmm3, xmm2
+ psllq xmm2, 32 ; 0 d0 0 d2 (listed from dword 0 upward)
+ psrlq xmm3, 32
+ psllq xmm3, 32 ; 0 d1 0 d3
+ paddd xmm2, xmm3 ; odd dwords carry d0+d1 and d2+d3, even dwords are 0
+ paddd sad_reg, xmm2 ; sqsum (kept in the odd dwords of sad_reg)
+
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0 ; byte sums of cur land in dwords 0 and 2
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4 ; move the ref byte sums to dwords 1 and 3
+ paddd sum_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad (kept in the even dwords of sad_reg)
+
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
+
+ add esi, ebx ; next row
+ add edi, ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+; dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2: ; returns in eax the SAD of ref vs. cur over iPicHeight rows of mb_width 16-byte groups
+ push esi
+ push edi
+ push ebp
+ push ebx
+ push edx
+
+ mov esi, [esp+24] ; ref_orig (five pushes + return address = 24 bytes)
+ mov edi, [esp+28] ; cur_orig
+ mov ebx, [esp+32] ; mb_width: 16-byte groups per row
+ mov ecx, [esp+36] ; iPicHeight: row counter (one stride step per iteration)
+ mov edx, [esp+40] ; iPicStride
+ pxor xmm0, xmm0 ; running total, low dword
+.hloop:
+ mov eax, ebx ; groups left in this row
+ mov ebp, $0 ; byte offset within the row
+.wloop:
+ movdqa xmm1, [esi+ebp] ; movdqa: both rows must be 16-byte aligned
+ movdqa xmm2, [edi+ebp]
+ psadbw xmm1, xmm2
+ pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float -- brings the high-half SAD down to dword 0
+ paddd xmm1, xmm2 ; dword 0 = SAD of all 16 bytes
+ paddd xmm0, xmm1
+ add ebp, 010h
+ dec eax
+ jnz near .wloop
+ lea esi, [esi+edx] ; next row
+ lea edi, [edi+edx]
+ dec ecx
+ jnz near .hloop
+
+ movd eax, xmm0 ; return value
+ pop edx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2: ; 16x16 block: stores (sqr>>8)-(sum>>8)^2 for |ref-src| at [pMotionTexture] and for src at [pMotionTexture+2], 16 bits each
+ push esi
+ push edi
+ push ebx
+
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16 (three pushes + 16 bytes of locals)
+
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
+
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7 ; one 16-byte store at [esp] zeroes all four accumulators
+
+.hloops:
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B: swap dword 0 and dword 2
+ paddw xmm4, xmm5 ; low dword = SAD of the whole row
+ movd ebx, xmm4
+ add SUM, ebx
+
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
+
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh ; only the low word holds the row total
+ add SUM_CUR, ebx
+
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
+
+ lea edi, [edi+edx] ; next ref row
+ lea esi, [esi+eax] ; next src row
+ dec ecx
+ jnz near .hloops
+
+ mov ebx, 0
+ mov bx, word SUM ; low 16 bits of SUM
+ sar ebx, 8 ; /256 (the block has 256 pixels)
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx ; (sqr >> 8) - (sum >> 8)^2
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx ; (sqr_cur >> 8) - (sum_cur >> 8)^2
+ mov [edi+2], cx ; to store uiTextureIndex
+
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
+
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
+
+ ret
+
+; , 6/7/2010
+
+%ifndef NO_DYNAMIC_VP
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2: ; returns sum(avg^2) - (sum(avg))^2/16 over the sixteen 4x4-block averages of the 16x16 area at pDataY
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, esp
+ and ebp, 0fh ; ebp = misalignment of esp
+ sub esp, ebp ; esp is now 16-byte aligned, as the movdqa loads from [esp] below require
+ sub esp, 32 ; 32-byte scratch buffer for the 16 block averages
+ %define PUSH_SIZE 52 ; 20 + 32
+
+ mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
+ mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
+
+ mov ebx, ecx
+ sal ebx, $1 ; linesize x 2 [ebx]
+ mov edx, ebx
+ add edx, ecx ; linesize x 3 [edx]
+ mov eax, ebx
+ sal eax, $1 ; linesize x 4 [eax]
+
+ pxor xmm7, xmm7 ; zero register used by the macros and the unpacks below
+
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp], xmm0 ; averages of blocks 0-3
+
+ lea esi, [esi+eax] ; four lines down
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+8], xmm0 ; blocks 4-7
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+16], xmm0 ; blocks 8-11
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+24], xmm0 ; blocks 12-15
+
+ movdqa xmm0, [esp] ; block 0~7
+ movdqa xmm1, [esp+16] ; block 8~15
+ movdqa xmm2, xmm0 ; keep blocks 0-7 for the squares
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; low word of xmm0 = sum of the 16 averages
+
+ pmullw xmm1, xmm1 ; squares of the averages (low 16 bits of each product)
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7 ; widen the squares to dwords
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh ; reverse the dword order
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords
+ paddd xmm1, xmm2 ; every dword = sum of the 16 squares
+
+ movd ebx, xmm0
+ and ebx, 0ffffh ; effective low word truncated
+ mov ecx, ebx
+ imul ebx, ecx ; (sum of averages)^2
+ sar ebx, $4 ; /16
+ movd eax, xmm1
+ sub eax, ebx ; return value
+
+ %undef PUSH_SIZE
+ add esp, 32
+ add esp, ebp ; undo the alignment adjustment
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3: ; returns sum(avg^2) - (sum(avg))^2/16 over the sixteen 4x4-block averages of the 16x16 area at pDataY
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ mov ebp, esp
+ and ebp, 0fh ; ebp = misalignment of esp
+ sub esp, ebp ; esp is now 16-byte aligned, as the movdqa loads from [esp] below require
+ sub esp, 32 ; 32-byte scratch buffer for the 16 block averages
+ %define PUSH_SIZE 52 ; 20 + 32
+
+ mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
+ mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
+
+ mov ebx, ecx
+ sal ebx, $1 ; linesize x 2 [ebx]
+ mov edx, ebx
+ add edx, ecx ; linesize x 3 [edx]
+ mov eax, ebx
+ sal eax, $1 ; linesize x 4 [eax]
+
+ pxor xmm7, xmm7 ; zero register used by the macros and the unpacks below
+
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp], xmm0 ; averages of blocks 0-3
+
+ lea esi, [esi+eax] ; four lines down
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [esp+8], xmm1 ; blocks 4-7
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [esp+16], xmm0 ; blocks 8-11
+
+ lea esi, [esi+eax]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [esp+24], xmm1 ; blocks 12-15
+
+ movdqa xmm0, [esp] ; block 0~7
+ movdqa xmm1, [esp+16] ; block 8~15
+ movdqa xmm2, xmm0 ; keep blocks 0-7 for the squares
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+
+ pmullw xmm1, xmm1 ; squares of the averages (low 16 bits of each product)
+ pmullw xmm2, xmm2
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7 ; widen the squares to dwords
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh ; reverse the dword order
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords
+ paddd xmm1, xmm2 ; every dword = sum of the 16 squares
+
+ movd ebx, xmm0
+ and ebx, 0ffffh ; effective low word truncated
+ mov ecx, ebx
+ imul ebx, ecx ; (sum of averages)^2
+ sar ebx, $4 ; /16
+ movd eax, xmm1
+ sub eax, ebx ; return value
+
+ %undef PUSH_SIZE
+ add esp, 32
+ add esp, ebp ; undo the alignment adjustment
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+%endif
+
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum)
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2: ; *pSum += SAD(ref_orig, cur_orig) over 16 rows, gom_pixel_num bytes per row taken in 16-byte steps
+%define ref_orig esp + pushsize + 4
+%define cur_orig esp + pushsize + 8
+%define iPicStride esp + pushsize + 12
+%define gom_pixel_num esp + pushsize + 16
+%define pSum esp + pushsize + 20
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [ref_orig]
+ mov edi, [cur_orig]
+ mov ebx, [iPicStride]
+ mov eax, [gom_pixel_num]
+ mov ecx, 16 ;MB_WIDTH_LUMA -- row counter consumed by the loop instruction below
+ pxor xmm0, xmm0 ; SAD accumulator, one partial per qword
+mb_width_loop_p:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_p:
+ movdqa xmm1, [esi] ; movdqa: both rows must be 16-byte aligned
+ movdqa xmm2, [edi]
+ psadbw xmm1, xmm2
+ paddd xmm0, xmm1
+ add esi, 16
+ add edi, 16
+ cmp esi, edx
+ jb gom_row_loop_p ; unsigned compare: esi/edx are addresses, a signed jl would stop early if the row straddles 0x80000000
+
+ sub esi, eax ; back to the start of the row ...
+ sub edi, eax
+ add esi, ebx ; ... then one stride down
+ add edi, ebx
+ loop mb_width_loop_p
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddd xmm1, xmm0 ; low dword = total of both qword partials
+ movd eax, xmm1
+ mov edx, [pSum] ; pSum
+ add [edx], eax ; accumulates into *pSum rather than overwriting it
+
+%undef ref_orig
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2: ; *pSum += sum of bytes, *pSqrSum += sum of squared bytes, over 16 rows of gom_pixel_num bytes taken in 16-byte steps
+%define cur_orig esp + pushsize + 4
+%define iPicStride esp + pushsize + 8
+%define gom_pixel_num esp + pushsize + 12
+%define pSum esp + pushsize + 16
+%define pSqrSum esp + pushsize + 20
+%define pushsize 8
+ push esi
+ push ebx
+ mov esi, [cur_orig]
+ mov eax, [gom_pixel_num]
+ mov ebx, [iPicStride]
+ mov ecx, 16 ;MB_WIDTH_LUMA -- row counter consumed by the loop instruction below
+ pxor xmm0, xmm0 ; zero
+ pxor xmm1, xmm1 ; sum
+ pxor xmm2, xmm2 ; sqr sum
+mb_width_loop_i:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_i:
+ movdqa xmm3, [esi] ; movdqa: the row must be 16-byte aligned
+ movdqa xmm4, xmm3
+ psadbw xmm4, xmm0 ; SAD against zero = plain sum of the bytes
+ paddd xmm1, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm0 ; bytes -> words
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm4, xmm4 ; square and add adjacent pairs -> dwords
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ paddd xmm2, xmm4
+ add esi, 16
+ cmp esi, edx
+ jb gom_row_loop_i ; unsigned compare: esi/edx are addresses, a signed jl would stop early if the row straddles 0x80000000
+
+ sub esi, eax ; back to the start of the row ...
+ add esi, ebx ; ... then one stride down
+ loop mb_width_loop_i
+
+ movdqa xmm3, xmm1
+ psrldq xmm3, 8
+ paddd xmm1, xmm3 ; low dword = total of both qword partials
+ movd eax, xmm1
+ mov edx, [pSum]
+ add [edx], eax ; accumulates into *pSum rather than overwriting it
+
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psrldq xmm3, 4
+ paddd xmm2, xmm3 ; low dword = total of the four dword partials
+ movd eax, xmm2
+ mov edx, [pSqrSum]
+ add [edx], eax ; accumulates into *pSqrSum
+
+
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pSqrSum
+%undef pushsize
+ pop ebx
+ pop esi
+ ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSad_sse2: ; per 16x16 block writes four 8x8 SADs to psad8x8 (16 bytes per block); writes their grand total to *psadframe
+%define cur_data esp + pushsize + 4
+%define ref_data esp + pushsize + 8
+%define iPicWidth esp + pushsize + 12
+%define iPicHeight esp + pushsize + 16
+%define iPicStride esp + pushsize + 20
+%define psadframe esp + pushsize + 24
+%define psad8x8 esp + pushsize + 28
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16 (rewrites the argument slot on the stack)
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+height_loop:
+ mov ecx, dword [iPicWidth] ; 16-pixel columns left in this block row
+ push esi ; save the row-start pointers; the esp-relative names are not used until they are popped
+ push edi
+width_loop:
+ pxor xmm6, xmm6 ; SAD of the upper 8 rows, one partial per 8-column half
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx], xmm6 ; upper-left 8x8
+ psrldq xmm6, 8
+ movd [edx+4], xmm6 ; upper-right 8x8
+
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6 ; lower-left 8x8
+ psrldq xmm6, 8
+ movd [edx+12], xmm6 ; lower-right 8x8
+
+ add edx, 16
+ sub esi, eax ; back up the 16 rows just consumed ...
+ sub edi, eax
+ add esi, 16 ; ... and move 16 pixels to the right
+ add edi, 16
+
+ dec ecx
+ jnz width_loop
+
+ pop edi
+ pop esi
+ add esi, eax ; next row of 16x16 blocks
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5 ; low dword = left-half + right-half totals
+ movd [edx], xmm7
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+; Per 16x16 block: stores four dwords through psad8x8 and one dword each through psum16x16 and psqsum16x16;
+; the accumulated total goes to *psadframe. Per-row arithmetic is in WELS_SAD_SUM_SQSUM_16x1_SSE2 (defined elsewhere).
+ALIGN 16
+VAACalcSadVar_sse2:
+%define localsize 8
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define pushsize 16
+ push ebp ; four register pushes = 16 bytes = pushsize
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize ; reserve the two dword locals tmp_esi / tmp_edi
+ mov esi, [cur_data] ; esi = cur_data pointer
+ mov edi, [ref_data] ; edi = ref_data pointer
+ mov ebx, [iPicStride] ; ebx = iPicStride; not referenced again in this routine, presumably consumed by the macro - confirm
+ mov edx, [psad8x8] ; edx = psad8x8 output cursor
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16, written back into the argument slot
+ shr dword [iPicHeight], 4 ; iPicHeight/16, written back into the argument slot
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; xmm0 = 0
+ pxor xmm7, xmm7 ; iFrameSad
+var_height_loop:
+ mov ecx, dword [iPicWidth] ; ecx = 16-pixel columns to process in this block row
+ mov [tmp_esi], esi ; row-start pointers go to locals, so esp stays fixed and the argument macros remain usable in the loop
+ mov [tmp_edi], edi
+var_width_loop:
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6 ; fold this half-block into the frame accumulator
+ movd [edx], xmm6 ; psad8x8[0] = dword 0 of xmm6
+ psrldq xmm6, 8 ; bring the high qword down
+ movd [edx+4], xmm6 ; psad8x8[1] = original dword 2 of xmm6
+
+ pxor xmm6, xmm6 ; restart only the SAD accumulator; xmm5/xmm4 keep accumulating over all 16 invocations
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6 ; psad8x8[2]
+ psrldq xmm6, 8
+ movd [edx+12], xmm6 ; psad8x8[3]
+
+ mov ebp, [psum16x16] ; ebp = current psum16x16 cursor (kept in the argument slot)
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1 ; dword 0 = low-qword sum + high-qword sum
+ movd [ebp], xmm5
+ add dword [psum16x16], 4 ; advance the cursor in place on the stack
+
+ movdqa xmm5, xmm4 ; xmm5 is free again, reused as scratch
+ psrldq xmm5, 8
+ paddd xmm4, xmm5 ; add the high qword onto the low qword
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3 ; dword 0 = sum of all four dwords of the sqsum accumulator
+
+ mov ebp, [psqsum16x16] ; ebp = current psqsum16x16 cursor
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4 ; advance the cursor in place on the stack
+
+ add edx, 16 ; four dwords written per 16x16 block
+ sub esi, eax ; step back iPicStride*16 bytes (presumably undoing the row advance done inside the macros - confirm)
+ sub edi, eax
+ add esi, 16 ; move 16 bytes right to the next block
+ add edi, 16
+
+ dec ecx
+ jnz var_width_loop ; NOTE(review): a width or height below 16 gives a zero count and these dec/jnz loops would wrap
+
+ mov esi, [tmp_esi] ; back to the start of this block row
+ mov edi, [tmp_edi]
+ add esi, eax ; advance both pointers by iPicStride*16 to the next block row
+ add edi, eax
+
+ dec dword [iPicHeight] ; the stack argument slot doubles as the row counter
+ jnz var_height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5 ; dword 0 = low-qword total + high-qword total
+ movd [edx], xmm7 ; *psadframe = accumulated total
+
+ add esp, localsize ; release locals, then restore registers in reverse push order
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+; Per 16x16 block: stores four dwords through psad8x8 and one dword each through psum16x16, psqsum16x16 and psqdiff16x16;
+; the total is accumulated in a stack local and stored to *psadframe at the end. Row math is in WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2.
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define pushsize 16
+ push ebp ; four register pushes = 16 bytes = pushsize
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize ; reserve the three dword locals
+ mov ecx, [iPicWidth] ; NOTE(review): dead load, ecx is overwritten before any use
+ mov ecx, [iPicHeight] ; NOTE(review): dead load, ecx is overwritten before any use
+ mov esi, [cur_data] ; esi = cur_data pointer
+ mov edi, [ref_data] ; edi = ref_data pointer
+ mov ebx, [iPicStride] ; ebx = iPicStride; presumably consumed by the macro - confirm
+ mov edx, [psad8x8] ; edx = psad8x8 output cursor
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16, written back into the argument slot
+ shr dword [iPicHeight], 4 ; iPicHeight/16, written back into the argument slot
+ shl eax, 4 ; iPicStride*16
+ mov ecx, [iPicWidth] ; NOTE(review): dead load, ecx is reloaded at sqdiff_height_loop
+ mov ecx, [iPicHeight] ; NOTE(review): dead load, ecx is reloaded at sqdiff_height_loop
+ pxor xmm0, xmm0 ; xmm0 = 0
+ movd [tmp_sadframe], xmm0 ; zero the frame accumulator local
+sqdiff_height_loop:
+ mov ecx, dword [iPicWidth] ; ecx = 16-pixel columns to process in this block row
+ mov [tmp_esi], esi ; remember the row-start pointers
+ mov [tmp_edi], edi
+sqdiff_width_loop:
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7 ; keep a copy so both halves can be summed
+ movd [edx], xmm7 ; psad8x8[0] = dword 0 of xmm7
+ psrldq xmm7, 8 ; bring the high qword down
+ paddd xmm1, xmm7 ; dword 0 of xmm1 = psad8x8[0] + psad8x8[1]
+ movd [edx+4], xmm7 ; psad8x8[1]
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp ; accumulate into the frame total
+
+ pxor xmm7, xmm7 ; restart only the SAD accumulator; xmm6/xmm5/xmm4 keep accumulating over all 16 invocations
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7 ; psad8x8[2]
+ psrldq xmm7, 8
+ paddd xmm1, xmm7 ; dword 0 of xmm1 = psad8x8[2] + psad8x8[3]
+ movd [edx+12], xmm7 ; psad8x8[3]
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp ; accumulate into the frame total
+
+ mov ebp, [psum16x16] ; ebp = current psum16x16 cursor (kept in the argument slot)
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1 ; dword 0 = low-qword sum + high-qword sum
+ movd [ebp], xmm6
+ add dword [psum16x16], 4 ; advance the cursor in place on the stack
+
+ mov ebp, [psqsum16x16] ; ebp = current psqsum16x16 cursor
+ pshufd xmm6, xmm5, 14 ;00001110 - dwords 2,3 moved down to positions 0,1
+ paddd xmm6, xmm5 ; add the high qword onto the low qword
+ pshufd xmm5, xmm6, 1 ;00000001 - dword 1 moved down to position 0
+ paddd xmm5, xmm6 ; dword 0 = sum of all four dwords
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4 ; advance the cursor in place on the stack
+
+ mov ebp, [psqdiff16x16] ; ebp = current psqdiff16x16 cursor
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5 ; dword 0 = sum of all four dwords
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4 ; advance the cursor in place on the stack
+
+ add edx, 16 ; four dwords written per 16x16 block
+ sub esi, eax ; step back iPicStride*16 bytes (presumably undoing the row advance done inside the macros - confirm)
+ sub edi, eax
+ add esi, 16 ; move 16 bytes right to the next block
+ add edi, 16
+
+ dec ecx
+ jnz sqdiff_width_loop ; NOTE(review): a width or height below 16 gives a zero count and these dec/jnz loops would wrap
+
+ mov esi, [tmp_esi] ; back to the start of this block row
+ mov edi, [tmp_edi]
+ add esi, eax ; advance both pointers by iPicStride*16 to the next block row
+ add edi, eax
+
+ dec dword [iPicHeight] ; the stack argument slot doubles as the row counter
+ jnz sqdiff_height_loop
+
+ mov ebx, [tmp_sadframe] ; ebx is free here, the caller's value is restored by the pop below
+ mov eax, [psadframe]
+ mov [eax], ebx ; *psadframe = accumulated total
+
+ add esp, localsize ; release locals, then restore registers in reverse push order
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: stores 16 bytes each through psad8x8 and p_sd8x8 and four bytes through p_mad8x8;
+; the total is accumulated in ebp and stored to *psadframe at the end. Row math is in WELS_SAD_SD_MAD_16x1_SSE2 (defined elsewhere).
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define p_sd8x8 esp + pushsize + localsize + 32
+%define p_mad8x8 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_ecx esp + 8
+%define pushsize 16
+ push ebp ; four register pushes = 16 bytes = pushsize
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize ; reserve the three dword locals
+ mov esi, [cur_data] ; esi = cur_data pointer
+ mov edi, [ref_data] ; edi = ref_data pointer
+ mov ebx, [iPicStride] ; ebx = iPicStride; presumably consumed by the macro - confirm
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16, written back into the argument slot
+ shr dword [iPicHeight], 4 ; iPicHeight/16, written back into the argument slot
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp ; ebp = frame total accumulator
+ pxor xmm0, xmm0 ; xmm0 = 0
+bgd_height_loop:
+ mov ecx, dword [iPicWidth] ; ecx = 16-pixel columns to process in this block row
+ mov [tmp_esi], esi ; remember the row-start pointers
+ mov [tmp_edi], edi
+bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+
+ mov edx, [p_mad8x8] ; edx = current p_mad8x8 cursor (kept in the argument slot)
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx ; ecx is used as scratch below; the column counter is restored before dec ecx
+ movhlps xmm1, xmm4 ; xmm1 low qword = xmm4 high qword
+ movd ecx, xmm4
+ mov [edx], cl ; p_mad8x8[0] = lowest byte of xmm4's low qword
+ movd ecx, xmm1
+ mov [edx+1],cl ; p_mad8x8[1] = lowest byte of xmm4's high qword
+ add edx, 2
+ mov [p_mad8x8], edx ; write the advanced cursor back
+
+
+ pslldq xmm7, 4 ; shift the first-half results up one dword so the second half can accumulate below them
+ pslldq xmm6, 4
+ pslldq xmm5, 4
+
+
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm4 ; same two-byte store as above, for the second half of the block
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx ; write the advanced cursor back
+
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+ mov edx, [psad8x8] ; edx = current psad8x8 cursor (kept in the argument slot)
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1 ; aligned 16-byte store: the destination must be 16-byte aligned
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b ; dword 0 of xmm2 = dword 3 of xmm1 (D1+D3)
+ paddd xmm1, xmm2 ; dword 0 = D0+D1+D2+D3
+ movd edx, xmm1
+ add ebp, edx ; sad frame
+
+ mov edx, [p_sd8x8] ; edx = current p_sd8x8 cursor (kept in the argument slot)
+ psubd xmm6, xmm5 ; per-dword sum_cur - sum_ref
+ pshufd xmm1, xmm6, 10001101b ; same reordering as used for psad8x8 above
+ movdqa [edx], xmm1 ; aligned 16-byte store: the destination must be 16-byte aligned
+ add edx, 16
+ mov [p_sd8x8], edx ; write the advanced cursor back
+
+
+ add edx, 16 ; NOTE(review): looks dead, edx is reloaded before its next visible use (unless the row macro reads edx - confirm)
+ sub esi, eax ; step back iPicStride*16 bytes (presumably undoing the row advance done inside the macros - confirm)
+ sub edi, eax
+ add esi, 16 ; move 16 bytes right to the next block
+ add edi, 16
+
+ mov ecx, [tmp_ecx] ; restore the column counter saved before ecx was used as scratch
+ dec ecx
+ jnz bgd_width_loop ; NOTE(review): a width or height below 16 gives a zero count and these dec/jnz loops would wrap
+
+ mov esi, [tmp_esi] ; back to the start of this block row
+ mov edi, [tmp_edi]
+ add esi, eax ; advance both pointers by iPicStride*16 to the next block row
+ add edi, eax
+
+ dec dword [iPicHeight] ; the stack argument slot doubles as the row counter
+ jnz bgd_height_loop
+
+ mov edx, [psadframe]
+ mov [edx], ebp ; *psadframe = accumulated total
+
+ add esp, localsize ; release locals, then restore registers in reverse push order
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp ; NOTE(review): tmp_ecx is %define'd above but is missing from the %undef list below
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: stores 16 bytes each through psad8x8 and p_sd8x8, four bytes through p_mad8x8 and one dword each through
+; psum16x16, psqsum16x16 and psqdiff16x16; the total is kept in a stack local and stored to *psadframe at the end.
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define localsize 16
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define p_sd8x8 esp + pushsize + localsize + 44
+%define p_mad8x8 esp + pushsize + localsize + 48
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define tmp_ecx esp + 12
+%define pushsize 16
+ push ebp ; four register pushes = 16 bytes = pushsize
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize ; reserve the four dword locals
+ mov esi, [cur_data] ; esi = cur_data pointer
+ mov edi, [ref_data] ; edi = ref_data pointer
+ mov ebx, [iPicStride] ; ebx = iPicStride; presumably consumed by the macro - confirm
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16, written back into the argument slot
+ shr dword [iPicHeight], 4 ; iPicHeight/16, written back into the argument slot
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; xmm0 = 0
+ movd [tmp_sadframe], xmm0 ; zero the frame accumulator local
+sqdiff_bgd_height_loop:
+ mov ecx, dword [iPicWidth] ; ecx = 16-pixel columns to process in this block row
+ mov [tmp_esi], esi ; remember the row-start pointers
+ mov [tmp_edi], edi
+sqdiff_bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8] ; edx = current psad8x8 cursor (kept in the argument slot)
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b ; dwords 2,3 moved down to positions 0,1
+ movd [edx], xmm2 ; psad8x8[0] = sad0
+ movd [edx+4], xmm1 ; psad8x8[1] = sad1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2 ; dword 0 = sad0 + sad1
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16] ; cursor is not advanced here; the second half adds onto this entry
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2 ; dword 0 = Scur0 + Scur1
+ movd [edx], xmm1 ; sum
+
+ mov edx, [p_sd8x8] ; edx = current p_sd8x8 cursor (kept in the argument slot)
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1 ; store two dwords: diff0, diff1
+ add edx, 8
+ mov [p_sd8x8], edx ; write the advanced cursor back
+
+ mov edx, [p_mad8x8] ; edx = current p_mad8x8 cursor (kept in the argument slot)
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx ; ecx is used as scratch below; the column counter is restored before dec ecx
+ movhlps xmm1, xmm5 ; xmm1 low qword = xmm5 high qword
+ movd ecx, xmm5
+ mov [edx], cl ; p_mad8x8[0] = lowest byte of xmm5's low qword
+ movd ecx, xmm1
+ mov [edx+1],cl ; p_mad8x8[1] = lowest byte of xmm5's high qword
+ add edx, 2
+ mov [p_mad8x8], edx ; write the advanced cursor back
+
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8] ; second half: same two-dword store as above
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp ; add the second-half sum onto the value stored for the first half
+ add edx, 4
+ mov [psum16x16], edx ; one dword per 16x16 block
+
+ mov edx, [psqsum16x16] ; edx = current psqsum16x16 cursor (kept in the argument slot)
+ psrlq xmm7, 32 ; move the sqsum dwords down into the low half of each qword
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7 ; dword 0 = sqsum0 + sqsum1
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx ; write the advanced cursor back
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1 ; store two dwords: diff0, diff1
+ add edx, 8
+ mov [p_sd8x8], edx ; write the advanced cursor back
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm5 ; same two-byte store as above, for the second half of the block
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx ; write the advanced cursor back
+
+ mov edx, [psqdiff16x16] ; edx = current psqdiff16x16 cursor (kept in the argument slot)
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1 ; add the high qword onto the low qword
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1 ; dword 0 = sum of all four dwords
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx ; write the advanced cursor back
+
+ add edx, 16 ; NOTE(review): looks dead, edx is reloaded before its next visible use (unless the row macro reads edx - confirm)
+ sub esi, eax ; step back iPicStride*16 bytes (presumably undoing the row advance done inside the macros - confirm)
+ sub edi, eax
+ add esi, 16 ; move 16 bytes right to the next block
+ add edi, 16
+
+ mov ecx, [tmp_ecx] ; restore the column counter saved before ecx was used as scratch
+ dec ecx
+ jnz sqdiff_bgd_width_loop ; NOTE(review): a width or height below 16 gives a zero count and these dec/jnz loops would wrap
+
+ mov esi, [tmp_esi] ; back to the start of this block row
+ mov edi, [tmp_edi]
+ add esi, eax ; advance both pointers by iPicStride*16 to the next block row
+ add edi, eax
+
+ dec dword [iPicHeight] ; the stack argument slot doubles as the row counter
+ jnz sqdiff_bgd_height_loop
+
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp ; *psadframe = accumulated total
+
+ add esp, localsize ; release locals, then restore registers in reverse push order
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp ; NOTE(review): tmp_sadframe and tmp_ecx are %define'd above but are missing from the %undef list below
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -1,36 +1,36 @@
-;*!
-;* \copy
-;* Copyright (c) 2011-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY welsvp.dll
-EXPORTS
- CreateVpInterface PRIVATE
- DestroyVpInterface PRIVATE
\ No newline at end of file
+;*!
+;* \copy
+;* Copyright (c) 2011-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY welsvp.dll
+EXPORTS
+ CreateVpInterface PRIVATE
+ DestroyVpInterface PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""windows.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@@ -23,7 +23,7 @@
rem call VP build
echo "Welsvp Building....."
cd %VPProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsVP_2008.vcproj
@@ -33,7 +33,7 @@
cd %CurDir%
cd %EncoderProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsEncCore.vcproj
%VCBUILDEXE% WelsEncPlus.vcproj
%VCBUILDEXE% encConsole.vcproj
@@ -44,7 +44,7 @@
cd %CurDir%
cd %DecoderProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsDecCore.vcproj
%VCBUILDEXE% WelsDecPlus.vcproj
%VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@@ -36,7 +36,7 @@
cd %CurDir%
cd %EncoderProjectDir%
echo current directory is %EncoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
%VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@@ -49,7 +49,7 @@
cd %CurDir%
cd %DecoderProjectDir%
echo current directory is %DecoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln
--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@@ -36,7 +36,7 @@
cd %CurDir%
cd %EncoderProjectDir%
echo current directory is %EncoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
%VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@@ -49,7 +49,7 @@
cd %CurDir%
cd %DecoderProjectDir%
echo current directory is %DecoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln
--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate 600 # Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode 0 # 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize 1500
+SliceNum 1 # multiple slices number specified
+
+SlicesAssign0 960 # count number of MBs in slice #0
+SlicesAssign1 0 # count number of MBs in slice #1
+SlicesAssign2 0 # count number of MBs in slice #2
+SlicesAssign3 0 # count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4 0 # count number of MBs in slice #4
+SlicesAssign5 0 # count number of MBs in slice #5
+SlicesAssign6 0 # count number of MBs in slice #6
+SlicesAssign7 0 # count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Typically a single row of MBs per slice, plus a slice size constraint, which includes re-encoding
+# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate 600 # Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode 0 # 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize 1500
+SliceNum 1 # multiple slices number specified
+
+SlicesAssign0 960 # count number of MBs in slice #0
+SlicesAssign1 0 # count number of MBs in slice #1
+SlicesAssign2 0 # count number of MBs in slice #2
+SlicesAssign3 0 # count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4 0 # count number of MBs in slice #4
+SlicesAssign5 0 # count number of MBs in slice #5
+SlicesAssign6 0 # count number of MBs in slice #6
+SlicesAssign7 0 # count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint, which includes re-encoding
+# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth 320 # Input frame width
-SourceHeight 192 # Input frame height
-FrameRateIn 12 # Input frame rate [Hz]
-FrameRateOut 12 # Output frame rate [Hz]
-InputFile CiscoVT2people_320x192_12fps.yuv # Input file
-ReconFile rec_layer2.yuv # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
-
-InitialQP 24 # Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate 600 # Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode 0 # 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize 1500
-SliceNum 1 # multiple slices number specified
-
-SlicesAssign0 960 # count number of MBs in slice #0
-SlicesAssign1 0 # count number of MBs in slice #1
-SlicesAssign2 0 # count number of MBs in slice #2
-SlicesAssign3 0 # count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4 0 # count number of MBs in slice #4
-SlicesAssign5 0 # count number of MBs in slice #5
-SlicesAssign6 0 # count number of MBs in slice #6
-SlicesAssign7 0 # count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE | SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth 320 # Input frame width
+SourceHeight 192 # Input frame height
+FrameRateIn 12 # Input frame rate [Hz]
+FrameRateOut 12 # Output frame rate [Hz]
+InputFile CiscoVT2people_320x192_12fps.yuv # Input file
+ReconFile rec_layer2.yuv # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc 66 # value of profile_idc (or 0 for auto detection)
+
+InitialQP 24 # Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate 600 # Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode 0 # 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize 1500
+SliceNum 1 # multiple slices number specified
+
+SlicesAssign0 960 # count number of MBs in slice #0
+SlicesAssign1 0 # count number of MBs in slice #1
+SlicesAssign2 0 # count number of MBs in slice #2
+SlicesAssign3 0 # count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4 0 # count number of MBs in slice #4
+SlicesAssign5 0 # count number of MBs in slice #5
+SlicesAssign6 0 # count number of MBs in slice #6
+SlicesAssign7 0 # count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE | SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE | according to SliceNum | Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE | according to SlicesAssign | Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE | according to PictureMBHeight | Specially for TP. Typically a single row of MBs per slice + slice size constraint, which includes re-encoding
+# 4 SM_DYN_SLICE | according to SliceSize | Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate)
-
-GOPSize 4 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 1 # ENABLE RC
-TargetBitrate 5000 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 0 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0.cfg # Layer 0 configuration file
-//LayerCfg layer1.cfg # Layer 1 configuration file
-LayerCfg layer2.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate)
+
+GOPSize 4 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC 1 # ENABLE RC
+TargetBitrate 5000 # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 0 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers 1 # Number of layers
+//LayerCfg layer0.cfg # Layer 0 configuration file
+//LayerCfg layer1.cfg # Layer 1 configuration file
+LayerCfg layer2.cfg # Layer 2 configuration file
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test_vd_1d.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate)
-
-GOPSize 4 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 0 # ENABLE RC
-TargetBitrate 5000 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 0 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0_vd.cfg # Layer 0 configuration file
-//LayerCfg layer1_vd.cfg # Layer 1 configuration file
-LayerCfg layer2_vd.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test_vd_1d.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate)
+
+GOPSize 4 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC 0 # ENABLE RC
+TargetBitrate 5000 # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 0 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 0 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers 1 # Number of layers
+//LayerCfg layer0_vd.cfg # Layer 0 configuration file
+//LayerCfg layer1_vd.cfg # Layer 1 configuration file
+LayerCfg layer2_vd.cfg # Layer 2 configuration file
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile test_vd_rc.264 # Bitstream file
-MaxFrameRate 30 # Maximum frame rate [Hz]
-FramesToBeEncoded -1 # Number of frames (at input frame rate), -1
-
-GOPSize 8 # GOP Size (at maximum frame rate), 16
-IntraPeriod 0 # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition 1
-
-EnableFrameCropping 1 # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
- # 2: on except for slice boundaries,
- # 3: two stage. slice boundries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
- # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
- # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC 1 # ENABLE RC
-TargetBitrate 600 # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise 1 # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
- # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
- # Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers 1 # Number of layers
-//LayerCfg layer0_vd.cfg # Layer 0 configuration file
-//LayerCfg layer1_vd.cfg # Layer 1 configuration file
-LayerCfg layer2_vd_rc.cfg # Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile test_vd_rc.264 # Bitstream file
+MaxFrameRate 30 # Maximum frame rate [Hz]
+FramesToBeEncoded -1 # Number of frames (at input frame rate), -1
+
+GOPSize 8 # GOP Size (at maximum frame rate), 16
+IntraPeriod 0 # Intra Period ( multiple of GoP size or -1)
+EnableSpsPpsIDAddition 1
+
+EnableFrameCropping 1 # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+ # 2: on except for slice boundaries,
+ # 3: two stage. slice boundaries on in second stage
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+ # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset 0 # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc 1 # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC 1 # ENABLE RC
+TargetBitrate 600 # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise 1 # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection 1 # Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection 1 # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization 1 # Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod 30 # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
+ # It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+ # Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers 1 # Number of layers
+//LayerCfg layer0_vd.cfg # Layer 0 configuration file
+//LayerCfg layer1_vd.cfg # Layer 1 configuration file
+LayerCfg layer2_vd_rc.cfg # Layer 2 configuration file