ref: ed9c03408f1ccb93fc4f6a8ce3b23e7f9e0d59c6
parent: 197423f271794ddc8a0d0a9070ab5a4834186ef9
author: Martin Storsjö <martin@martin.st>
date: Tue Mar 18 11:59:42 EDT 2014
Rename the asm subdirectories to x86. This is consistent with having the arm assembly in a subdirectory called arm.
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -349,7 +349,7 @@
Filter="*.asm;*.inc"
>
<File
- RelativePath="..\..\..\decoder\core\asm\block_add.asm"
+ RelativePath="..\..\..\decoder\core\x86\block_add.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -429,7 +429,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\dct.asm"
+ RelativePath="..\..\..\decoder\core\x86\dct.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -549,7 +549,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\intra_pred.asm"
+ RelativePath="..\..\..\decoder\core\x86\intra_pred.asm"
>
<FileConfiguration
Name="Release|Win32"
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -1670,7 +1670,7 @@
Filter="*.asm;*.inc"
>
<File
- RelativePath="..\..\..\encoder\core\asm\coeff.asm"
+ RelativePath="..\..\..\encoder\core\x86\coeff.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1750,7 +1750,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\dct.asm"
+ RelativePath="..\..\..\encoder\core\x86\dct.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1870,7 +1870,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\intra_pred.asm"
+ RelativePath="..\..\..\encoder\core\x86\intra_pred.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2030,7 +2030,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\memzero.asm"
+ RelativePath="..\..\..\encoder\core\x86\memzero.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2070,7 +2070,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\quant.asm"
+ RelativePath="..\..\..\encoder\core\x86\quant.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2150,7 +2150,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\score.asm"
+ RelativePath="..\..\..\encoder\core\x86\score.asm"
>
<FileConfiguration
Name="Debug|Win32"
--- a/codec/decoder/core/asm/block_add.asm
+++ /dev/null
@@ -1,151 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* block_add.asm
-;*
-;* Abstract
-;* add block
-;*
-;* History
-;* 09/21/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-;*******************************************************************************
-; void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
-; Writes zeros to 16 rows of 32 bytes each; rows are iStride*2 bytes apart.
-;*******************************************************************************
-WELS_EXTERN WelsResBlockZero16x16_sse2
- %assign push_num 0
- LOAD_2_PARA ; presumably leaves pBlock in r0 and iStride in r1 (macro is in asm_inc.asm)
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- lea r1, [r1*2] ; r1 = iStride*2, the row pitch in bytes
- lea r2, [r1*3] ; r2 = 3 row pitches, reaches the 4th row of each group
-
- pxor xmm7, xmm7 ; xmm7 = 0, the value written by every store below
-
- ; rows 0-3: two 16-byte movdqa stores per row (addresses must be 16-byte aligned)
- movdqa [r0], xmm7
- movdqa [r0+10h], xmm7
-
- movdqa [r0+r1], xmm7
- movdqa [r0+r1+10h], xmm7
-
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r1*2+10h], xmm7
-
- movdqa [r0+r2], xmm7
- movdqa [r0+r2+10h], xmm7
-
- ; rows 4-7
- lea r0, [r0+r1*4]
- movdqa [r0], xmm7
- movdqa [r0+10h], xmm7
-
- movdqa [r0+r1], xmm7
- movdqa [r0+r1+10h], xmm7
-
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r1*2+10h], xmm7
-
- movdqa [r0+r2], xmm7
- movdqa [r0+r2+10h], xmm7
-
- ; rows 8-11
- lea r0, [r0+r1*4]
- movdqa [r0], xmm7
- movdqa [r0+10h], xmm7
-
- movdqa [r0+r1], xmm7
- movdqa [r0+r1+10h], xmm7
-
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r1*2+10h], xmm7
-
- movdqa [r0+r2], xmm7
- movdqa [r0+r2+10h], xmm7
-
- ; rows 12-15
- lea r0, [r0+r1*4]
- movdqa [r0], xmm7
- movdqa [r0+10h], xmm7
-
- movdqa [r0+r1], xmm7
- movdqa [r0+r1+10h], xmm7
-
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r1*2+10h], xmm7
-
- movdqa [r0+r2], xmm7
- movdqa [r0+r2+10h], xmm7
-
- POP_XMM
- ret
-
-;*******************************************************************************
-; void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
-; Writes zeros to 8 rows of 16 bytes each; rows are iStride*2 bytes apart.
-;*******************************************************************************
-WELS_EXTERN WelsResBlockZero8x8_sse2
- %assign push_num 0
- LOAD_2_PARA ; presumably leaves pBlock in r0 and iStride in r1 (macro is in asm_inc.asm)
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- lea r1, [r1*2] ; r1 = iStride*2, the row pitch in bytes
- lea r2, [r1*3] ; r2 = 3 row pitches, reaches the 4th row of each group
-
- pxor xmm7, xmm7 ; xmm7 = 0, the value written by every store below
-
- movdqa [r0], xmm7 ; rows 0-3, one aligned 16-byte store per row
- movdqa [r0+r1], xmm7
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r2], xmm7
-
- lea r0, [r0+r1*4] ; advance to row 4
- movdqa [r0], xmm7 ; rows 4-7
- movdqa [r0+r1], xmm7
- movdqa [r0+r1*2], xmm7
- movdqa [r0+r2], xmm7
-
-
- POP_XMM
- ret
-
--- a/codec/decoder/core/asm/dct.asm
+++ /dev/null
@@ -1,115 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* dct.asm
-;*
-;* Abstract
-;* WelsDctFourT4_sse2
-;*
-;* History
-;* 8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3 ; in: %1 = a, %2 = b; out: %3 = a + (b>>1), %1 = (a>>1) - b; %2 is not modified
- movq %3, %2
- psraw %3, $01 ; %3 = b >> 1 (arithmetic shift of each 16-bit lane)
- paddw %3, %1 ; %3 = a + (b >> 1)
- psraw %1, $01 ; %1 = a >> 1
- psubw %1, %2 ; %1 = (a >> 1) - b
-%endmacro
-
-%macro MMX_SumSub 3 ; in: %1 = a, %2 = b; out: %1 = a + b, %2 = b - a; %3 is scratch (ends as the old b)
- movq %3, %2 ; keep b, because %2 is overwritten next
- psubw %2, %1 ; %2 = b - a
- paddw %1, %3 ; %1 = a + b
-%endmacro
-
-%macro MMX_IDCT 6 ; in: %2,%3,%4,%5; results land in %1,%3,%4,%5; %6 is scratch; %2 is not modified
- MMX_SumSub %4, %5, %6 ; %4 = %4 + %5, %5 = %5 - %4(old)
- MMX_SumSubDiv2 %3, %2, %1 ; %1 = %3 + (%2>>1), %3 = (%3>>1) - %2
- MMX_SumSub %1, %4, %6 ; %1 = %1 + %4, %4 = %4 - %1(old)
- MMX_SumSub %3, %5, %6 ; %3 = %3 + %5, %5 = %5 - %3(old)
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5 ; %1 = 4 words, %2 = scratch, %3 = words added before the shift, %4 = bytes interleaved while widening, %5 = 4-byte memory operand (read, then written)
- movd %2, %5 ; load the 4 bytes currently at %5
- punpcklbw %2, %4 ; widen them to words (zero-extension if %4 is zero - the caller is expected to ensure that)
- paddw %1, %3 ; add the rounding term supplied in %3
- psraw %1, $06 ; %1 = (%1 + %3) >> 6
- paddsw %1, %2 ; add the widened bytes with signed saturation
- packuswb %1, %2 ; clamp the words to 0..255 and pack to bytes
- movd %5, %1 ; store the low 4 result bytes back to %5
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-;*******************************************************************************
-; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-; Transforms the 16 int16_t values at pRs and adds the result to 4 rows of 4 bytes at pPred.
-WELS_EXTERN IdctResAddPred_mmx
- %assign push_num 0
- LOAD_3_PARA ; presumably r0 = pPred, r1 = kiStride, r2 = pRs (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
- movq mm0, [r2+ 0] ; four 8-byte loads = 16 words from pRs
- movq mm1, [r2+ 8]
- movq mm2, [r2+16]
- movq mm3, [r2+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 ; presumably a 4x4 word transpose (defined in asm_inc.asm)
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 ; first butterfly pass
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 ; second butterfly pass
-
- WELS_Zero mm7 ; presumably clears mm7; it is the widening operand of MMX_StoreDiff4P
- WELS_DW32 mm6 ; presumably fills mm6 with words of 32, the rounding term for the >>6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0] ; row 0
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1] ; row 1
- lea r0, [r0+2*r1]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0] ; row 2
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1] ; row 3
-
-
- emms
- ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ /dev/null
@@ -1,1414 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 and mmx function for intra predict operations(decoder)
-;*
-;* History
-;* 18/09/2009 Created
-;* 19/11/2010 Added
-;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;* and WelsDecoderIChromaPredDcNA_mmx
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
-; WELSEMMS expands to emms; the empty alternative is disabled by the constant %if above
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 ; multiplied by b for the left 8 columns of a 16-wide plane row
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 ; gradient weights, also multiplied by b for the right 8 columns
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 ; gradient weights for the mirrored half
-
-; for chroma plane mode (the two 4-word tables are read with movq, so they carry no align directive)
-sse2_plane_inc_c dw 1, 2, 3, 4
-sse2_plane_dec_c dw 4, 3, 2, 1
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 ; multiplied by b for the 8 columns of a chroma plane row
-
-align 16
-mmx_01bytes: times 16 db 1 ; 0x01 in every byte: multiplier that replicates a byte, and low-bit mask
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
-
-align 16
-sse2_dc_0x80: times 16 db 0x80
-align 16
-sse2_wd_0x02: times 8 dw 0x02
-
-;*******************************************************************************
-; macros
-;*******************************************************************************
-; args: %1 = result xmm, %2 and %3 = scratch xmm, %4 = row pointer, %5 = stride (originally xmm0, xmm1, xmm2, eax, ecx)
-; the low 64 bits of %1 receive 4 copies of the byte at [%4-1] followed by 4 copies of the byte at [%4+%5-1]
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1] ; lowest byte = pixel to the left of the first row
- movdqa %3, %1
- punpcklbw %1, %3 ; every byte doubled
- movdqa %3, %1
- punpcklbw %1, %3 ; every byte now repeated 4 times
-
- ;add %4, %5
- movd %2, [%4+%5-1] ; same replication for the second row
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2 ; low dword of %1, then low dword of %2
-%endmacro
-
-
-%macro LOAD_COLUMN 6 ; gathers byte 3 of 8 consecutive rows at [%5] (stride %6) into the upper 8 bytes of %1, in row order; %2-%4 are scratch
- movd %1, [%5] ; rows 0 and 1
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5] ; rows 2 and 3
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3 ; top dword of %1 = byte 3 of rows 0..3
- lea %5, [%5+2*%6]
- movd %4, [%5] ; rows 4 and 5
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5] ; rows 6 and 7
- movd %2, [%5+%6]
- lea %5, [%5+2*%6] ; %5 ends 8 rows below where it started
- punpcklbw %3, %2
- punpcklwd %4, %3 ; top dword of %4 = byte 3 of rows 4..7
- punpckhdq %1, %4 ; bytes 8..15 of %1 = byte 3 of rows 0..7
-%endmacro
-
-%macro SUMW_HORIZON 3 ; sums the 8 words of %1 into its lowest word; %2 is scratch; bits above the low 16 are only clean if %3 is zero
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 (each widened to a dword, upper halves taken from %3)
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-%macro COPY_16_TIMES 2 ; %2 = the byte at [%1-1] replicated 16 times; [%1-16] must be 16-byte aligned (movdqa)
- movdqa %2, [%1-16] ; the 16 bytes that end just before %1
- psrldq %2, 15 ; keep only the last of them, in the lowest byte
- pmuludq %2, [mmx_01bytes] ; times 0x01010101: low dword = that byte 4 times
- pshufd %2, %2, 0 ; broadcast the low dword to all four dwords
-%endmacro
-
-%macro COPY_16_TIMESS 3 ; as COPY_16_TIMES, but for the byte at [%1+%3-1] (%3 = offset to the row, e.g. the stride)
- movdqa %2, [%1+%3-16] ; the 16 bytes that end just before %1+%3 (must be 16-byte aligned)
- psrldq %2, 15 ; keep only the last of them, in the lowest byte
- pmuludq %2, [mmx_01bytes] ; times 0x01010101: low dword = that byte 4 times
- pshufd %2, %2, 0 ; broadcast the low dword to all four dwords
-%endmacro
-
-%macro LOAD_COLUMN_C 6 ; gathers byte 3 of 4 consecutive rows at [%5] (stride %6) into bytes 4..7 of the 64-bit register %1; %2, %3 scratch; %4 is not used
- movd %1, [%5] ; rows 0 and 1
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5] ; rows 2 and 3
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3 ; upper dword of %1 = byte 3 of rows 0..3 (when %1 is an MMX register, as at the call sites)
- lea %5, [%5+2*%6] ; %5 ends 4 rows below where it started
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0 ; advances r0 by two rows (r1 = stride) and adds the bytes left of both rows to r2; overwrites r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; pixel left of the first row
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; pixel left of the second row
- add r2, r3
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-;*******************************************************************************
-; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-; Fills each of the 4 rows at pPred with 4 copies of the byte just left of that row.
-; pPred must align to 16 (original note; the stores below are 4-byte movd stores)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
-
- movzx r2, byte [r0-1] ; left neighbour of row 0
- movd xmm0, r2d
- pmuludq xmm0, [mmx_01bytes] ; times 0x01010101: low dword = the byte 4 times
-
- movzx r2, byte [r0+r1-1] ; left neighbour of row 1
- movd xmm1, r2d
- pmuludq xmm1, [mmx_01bytes]
-
- lea r0, [r0+r1] ; r0 -> row 1
- movzx r2, byte [r0+r1-1] ; left neighbour of row 2
- movd xmm2, r2d
- pmuludq xmm2, [mmx_01bytes]
-
- movzx r2, byte [r0+2*r1-1] ; left neighbour of row 3
- movd xmm3, r2d
- pmuludq xmm3, [mmx_01bytes]
-
- sub r0, r1 ; r0 -> row 0 again
- movd [r0], xmm0
- movd [r0+r1], xmm1
- lea r0, [r0+2*r1] ; r0 -> row 2
- movd [r0], xmm2
- movd [r0+r1], xmm3
-
- ret
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
-; Writes 16 rows of 16 bytes: row y, column weight w gets clamp((s + y*c + w*b) >> 5), w = -7..8.
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0 ; save r0 in r4
- sub r0, 1
- sub r0, r1 ; r0 -> top-left neighbour (one row up, one column left)
-
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r0] ; top-left pixel and top[0..6]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5 ; weighted by 8..1
- movq xmm1, [r0 + 9] ; top[8..15]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6 ; weighted by 1..8
- psubw xmm1, xmm0
-
- SUMW_HORIZON xmm1,xmm0,xmm2 ; xmm2 is not cleared here; only the low 16 bits of the sum are used below
- movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r2, r2w
- imul r2, 5
- add r2, 32
- sar r2, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r3, BYTE [r0+16] ; r3 = top[15]
- sub r0, 3 ; LOAD_COLUMN gathers byte 3 of each row, so step 3 bytes left
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1 ; left column of rows -1..6; r0 moves down 8 rows
-
- add r0, 3
- movzx r2, BYTE [r0+8*r1] ; left neighbour of row 15
- add r3, r2
- shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
-
- sub r0, 3
- add r0, r1 ; skip row 7
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1 ; left column of rows 8..15
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4 ; widen the gathered column bytes to words
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
-
- SUMW_HORIZON xmm7,xmm0,xmm2 ; again only the low 16 bits of the sum are used
- movd r2d, xmm7 ; V
- movsx r2, r2w
-
- imul r2, 5
- add r2, 32
- sar r2, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
-
- mov r0, r4 ; back to pPred
- add r3, 16
- imul r2, -7
- add r3, r2 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r2, r2 ; r2 = row counter
- movdqa xmm5, [sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5 ; b * (-7..0): left 8 columns
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6 ; b * (1..8): right 8 columns
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3 ; clamp to 0..255 and pack the 16 results
- movdqa [r0], xmm2 ; aligned store of one row
- paddw xmm0, xmm4 ; s += c for the next row
- add r0, r1
- inc r2
- cmp r2, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
-
- POP_XMM
- pop r4
- pop r3
- ret
-
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
-; Fills each of the 16 rows with 16 copies of the byte just left of that row (aligned 16-byte accesses).
-;*******************************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2 ; %1 = row pointer (advanced by two rows first), %2 = stride; fills two rows; overwrites xmm0
- lea %1, [%1+%2*2]
-
- COPY_16_TIMES %1, xmm0
- movdqa [%1], xmm0
- COPY_16_TIMESS %1, xmm0, %2
- movdqa [%1+%2], xmm0
-%endmacro
-
-WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
- %assign push_num 0
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
-
- COPY_16_TIMES r0, xmm0 ; row 0
- movdqa [r0], xmm0
- COPY_16_TIMESS r0, xmm0, r1 ; row 1
- movdqa [r0+r1], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 2-3
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 4-5
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 6-7
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 8-9
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 10-11
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 12-13
- SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 14-15
-
- ret
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
-; Copies the 16 bytes of the row above pPred into each of the 16 rows (aligned 16-byte accesses).
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
- %assign push_num 0
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
-
- sub r0, r1 ; r0 -> row above the block
- movdqa xmm0, [r0] ; xmm0 = the 16 top neighbours
-
- movdqa [r0+r1], xmm0 ; row 0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 1-2
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 3-4
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 5-6
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 7-8
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 9-10
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 11-12
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; rows 13-14
- movdqa [r0+r1], xmm0
- lea r0, [r0+2*r1]
- movdqa [r0], xmm0 ; row 15
-
- ret
-;*******************************************************************************
-; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
-; Writes 8 rows of 8 bytes: row y, column weight w gets clamp((s + y*c + w*b) >> 5), w = -3..4.
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r4, r0 ; keep pPred for the store loop
- sub r0, 1
- sub r0, r1 ; r0 -> top-left neighbour (one row up, one column left)
-
- pxor mm7, mm7
- movq mm0, [r0] ; top-left pixel and top[0..2] are the 4 bytes widened below
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5 ; weighted by 4..1
- movq mm1, [r0 + 5] ; top[4..7]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6 ; weighted by 1..4
- psubw mm1, mm0
-
- movq2dq xmm1, mm1 ; continue in SSE2 registers for the horizontal sum
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r2d, xmm1 ; H
- movsx r2, r2w
- imul r2, 17
- add r2, 16
- sar r2, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r3, BYTE [r0+8] ; r3 = top[7]
- sub r0, 3 ; LOAD_COLUMN_C gathers byte 3 of each row, so step 3 bytes left
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1 ; left column of rows -1..2; r0 moves down 4 rows
-
- add r0, 3
- movzx r2, BYTE [r0+4*r1] ; left neighbour of row 7
- add r3, r2
- shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
-
- sub r0, 3
- add r0, r1 ; skip row 3
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1 ; left column of rows 4..7
- pxor mm4, mm4
- punpckhbw mm0, mm4 ; widen the gathered column bytes to words
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
-
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r2d, xmm7 ; V
- movsx r2, r2w
-
- imul r2, 17
- add r2, 16
- sar r2, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
-
- mov r0, r4 ; back to pPred
- add r3, 16
- imul r2, -3
- add r3, r2 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r2, r2 ; r2 = row counter
- movdqa xmm5, [sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5 ; b * (-3..4)
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2 ; clamp to 0..255 and pack
- movq [r0], xmm2 ; store the 8 bytes of one row
- paddw xmm0, xmm4 ; s += c for the next row
- add r0, r1
- inc r2
- cmp r2, 8
- jnz get_i_chroma_pred_plane_sse2_1
-
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
-
-;*******************************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pPred[7] = ([6]+[0]*2+[1]+2)/4
-;
-; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-; Builds the edge [3 2 1 0 6 11 16 21], filters it with (a + 2*b + c + 2) >> 2 and writes 4 rows of 4 bytes.
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
-
- movq mm1,[r2+r1-8] ;load the 8 bytes that end at 11 (left neighbour of row 1): mm1[8] = [11]
- movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
- sub r2, r1 ;move r2 to the line above the current block (position of 1)
- punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r2,[r2+r1*2-8h] ;r2 = address of 12, minus 8, so the movq loads below end on the left-neighbour byte
- movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r2+r1*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes, turning the rounded-up average into a truncated one
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
- lea r0,[r0+r1] ;r0 -> row 1
- movd [r0+2*r1],mm2 ;row 3
- sub r0,r1 ;r0 -> row 0
- psrlq mm2,8
- movd [r0+2*r1],mm2 ;row 2
- psrlq mm2,8
- movd [r0+r1],mm2 ;row 1
- psrlq mm2,8
- movd [r0],mm2 ;row 0
- WELSEMMS
- ret
-
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
-; fills each of the 8 rows with 8 copies of the pixel to the left of that row
-;*******************************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4 ; %1 = work register, %2 = not used, %3 = source row pointer, %4 = destination address
- movq %1, [%3-8] ; the 8 bytes that end just before the row
- psrlq %1, 38h ; keep only the last of them, in the lowest byte
-
- pmullw %1, [mmx_01bytes] ; times 0x0101: low word = the byte twice
- pshufw %1, %1, 0 ; broadcast the low word to all four words
- movq [%4], %1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4 ; same, but the source row is [%3+r1], i.e. one stride (r1, hard-coded) below %3
- movq %1, [%3+r1-8]
- psrlq %1, 38h
-
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-WELS_EXTERN WelsDecoderIChromaPredH_mmx
- %assign push_num 0
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
- mov r2, r0 ; r2 walks the source rows, r0 the destination rows
-
- movq mm0, [r2-8] ; row 0, same steps as MMX_PRED_H_8X8_ONE_LINE
- psrlq mm0, 38h
-
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 1
-
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 2
-
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 3
-
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 4
-
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 5
-
- lea r2, [r2+r1*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 6
-
- lea r0, [r0+2*r1]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 7
-
- WELSEMMS
- ret
-
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
-; copies the 8 pixels of the row above pPred into each of the 8 rows
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredV_mmx
- %assign push_num 0
- LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro is in asm_inc.asm)
- SIGN_EXTENSION r1, r1d
-
- sub r0, r1 ; r0 -> row above the block
- movq mm0, [r0] ; mm0 = the 8 top neighbours
-
- movq [r0+r1], mm0 ; rows 0-1
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0 ; rows 2-3
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0 ; rows 4-5
- movq [r0+2*r1], mm0
- lea r0, [r0+2*r1]
- movq [r0+r1], mm0 ; rows 6-7
- movq [r0+2*r1], mm0
-
- WELSEMMS
- ret
-
-
-;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 is never used
-; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
-
-; a = (1 + lt + l0)>>1
-; e = (1 + l0 + l1)>>1
-; g = (1 + l1 + l2)>>1
-; i = (1 + l2 + l3)>>1
-
-; d = (2 + t0 + (t1<<1) + t2)>>2
-; c = (2 + lt + (t0<<1) + t1)>>2
-; b = (2 + l0 + (lt<<1) + t0)>>2
-
-; f = (2 + l1 + (l0<<1) + lt)>>2
-; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
-; [b a f e h g j i] + [d c b a] --> mov to memory
-; Reads lt, t0..t2 and l0..l3 around the 4x4 block and writes the 16 bytes laid out above.
-; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1 ; r2 -> row above the block
- movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movd mm2, [r2+2*r1-4]
- punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
-
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h g]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
- movd [r0], mm2 ; row 0 = a b c d
- lea r0, [r0+r1]
- movd [r0+2*r1], mm3 ; row 3 = i j g h
- sub r0, r1
- psrlq mm3, 10h
- movd [r0+2*r1], mm3 ; row 2 = g h e f
- psrlq mm3, 10h
- movd [r0+r1], mm3 ; row 1 = e f a b
- WELSEMMS
- ret
-
-
-
-;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; lt and t0..t3 are never used (only the left column is read)
-; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
-
-; a = (1 + l0 + l1)>>1
-; c = (1 + l1 + l2)>>1
-; e = (1 + l2 + l3)>>1
-; g = l3
-
-; b = (2 + l0 + (l1<<1) + l2)>>2
-; d = (2 + l1 + (l2<<1) + l3)>>2
-; f = (2 + l2 + (l3<<1) + l3)>>2
-
-; [g g f e d c b a] + [g g g g] --> mov to memory
-; Reads l0..l3 (the bytes left of the 4 rows) and writes the 16 bytes laid out above.
-; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
-
- movd mm0, [r2-4] ; mm0[3] = l0
- punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
- lea r2, [r2+2*r1]
- movd mm2, [r2-4] ; mm2[3] = l2
- movd mm4, [r2+r1-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
-
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
-
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
- psrlq mm4, 20h
- lea r0, [r0+r1]
- movd [r0+2*r1], mm4 ; row 3 = g g g g
-
- sub r0, r1
- movd [r0], mm1 ; row 0 = a b c d
- psrlq mm1, 10h
- movd [r0+r1], mm1 ; row 1 = c d e f
- psrlq mm1, 10h
- movd [r0+2*r1], mm1 ; row 2 = e f g g
- WELSEMMS
- ret
-
-
-
-;*******************************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 is never used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
-
-; a = (1 + lt + t0)>>1
-; b = (1 + t0 + t1)>>1
-; c = (1 + t1 + t2)>>1
-; d = (1 + t2 + t3)>>1
-
-; e = (2 + l0 + (lt<<1) + t0)>>2
-; f = (2 + lt + (t0<<1) + t1)>>2
-; g = (2 + t0 + (t1<<1) + t2)>>2
-
-; h = (2 + t1 + (t2<<1) + t3)>>2
-; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-; Reads lt, t0..t3 and l0..l2 around the 4x4 block and writes the 16 bytes laid out above.
-; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1 ; r2 -> row above the block
- movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [r2+2*r1-4]
- punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
- lea r2, [r2+2*r1]
- movq mm2, [r2+r1-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
-
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
-
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
-
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1 ; row 0 = a b c d
-
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+r1], mm2 ; row 1 = e f g h
-
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+2*r1], mm4 ; row 2 = i a b c
-
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- lea r0, [r0+2*r1]
- movd [r0+r1], mm5 ; row 3 = j e f g
- WELSEMMS
- ret
-
-;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,l0,l1,l2,l3 are never used (only t0..t7 are read)
-; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
-
-; a = (2 + t0 + t2 + (t1<<1))>>2
-; b = (2 + t1 + t3 + (t2<<1))>>2
-; c = (2 + t2 + t4 + (t3<<1))>>2
-; d = (2 + t3 + t5 + (t4<<1))>>2
-
-; e = (2 + t4 + t6 + (t5<<1))>>2
-; f = (2 + t5 + t7 + (t6<<1))>>2
-; g = (2 + t6 + t7 + (t7<<1))>>2
-
-; [g f e d c b a] --> mov to memory
-; Reads the 8 bytes t0..t7 of the row above pPred and writes the 16 bytes laid out above.
-; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1 ; r2 -> row above the block
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
-
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
- psrlq mm0, 8h
- movd [r0], mm0 ; row 0 = a b c d
- psrlq mm0, 8h
- movd [r0+r1], mm0 ; row 1 = b c d e
- psrlq mm0, 8h
- movd [r0+2*r1], mm0 ; row 2 = c d e f
- psrlq mm0, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm0 ; row 3 = d e f g
- WELSEMMS
- ret
-
-
-;*******************************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
-
-; a = (1 + t0 + t1)>>1
-; b = (1 + t1 + t2)>>1
-; c = (1 + t2 + t3)>>1
-; d = (1 + t3 + t4)>>1
-; i = (1 + t4 + t5)>>1
-
-; e = (2 + t0 + (t1<<1) + t2)>>2
-; f = (2 + t1 + (t2<<1) + t3)>>2
-; g = (2 + t2 + (t3<<1) + t4)>>2
-; h = (2 + t3 + (t4<<1) + t5)>>2
-; j = (2 + t4 + (t5<<1) + t6)>>2
-
-; [i d c b a] + [j h g f e] --> mov to memory
-;
-; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r2, r0
-
- sub r2, r1
- movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+2*r1], mm3
-
- movd [r0+r1], mm2
- psrlq mm2, 8h
- lea r0, [r0+2*r1]
- movd [r0+r1], mm2
- WELSEMMS
- ret
-
-;*******************************************************************************
-;
-; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
-
- sub r0, r1
- movq mm0, [r0]
-
- movzx r2, byte [r0+r1-0x01] ; l1
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l2
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l3
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l4
- add r2, r3
- movd mm1, r2d ; mm1 = l1+l2+l3+l4
-
- movzx r2, byte [r0+r1-0x01] ; l5
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l6
- add r2, r3
- movzx r3, byte [r0+r1-0x01] ; l7
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0-0x01] ; l8
- add r2, r3
- movd mm2, r2d ; mm2 = l5+l6+l7+l8
-
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
- movq mm4, [mmx_0x02]
-
- paddq mm0, mm4
- psrlq mm0, 0x02
-
- paddq mm2, mm4
- psrlq mm2, 0x02
-
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
-
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
-
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
-
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
-
- movq [r4], mm0
- movq [r4+r1], mm0
- movq [r4+2*r1], mm0
- lea r4, [r4+2*r1]
- movq [r4+r1], mm0
-
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
- movq [r4+2*r1], mm1
- lea r4, [r4+2*r1]
- movq [r4+r1], mm1
-
- pop r4
- pop r3
- WELSEMMS
- ret
-
-
-
-;*******************************************************************************
-;
-; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- sub r0, r1
- movdqa xmm0, [r0] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
-
- movzx r2, byte [r0+r1-0x01]
- movzx r3, byte [r0+2*r1-0x01]
- add r2, r3
- lea r0, [r0+r1]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r2, 0x10
- movd xmm1, r2d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
-
- movdqa [r4], xmm0
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
- movdqa [r4+2*r1], xmm0
- lea r4, [r4+2*r1]
-
- movdqa [r4+r1], xmm0
-
- pop r4
- pop r3
-
- ret
-
-;*******************************************************************************
-; for intra prediction as follows, 11/19/2010
-;*******************************************************************************
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movdqa xmm0, [r2] ; pPred-kiStride, top line
- pxor xmm7, xmm7
- psadbw xmm0, xmm7
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddw xmm0, xmm1
- xor r2, r2
- movd r2d, xmm0
- ;movdqa xmm1, xmm0
- ;punpcklbw xmm0, xmm7
- ;punpckhbw xmm1, xmm7
-
- ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
- ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
- ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
- ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
- ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
- ;pshuflw xmm1, xmm0, 0b1h ; 10110001
- ;paddw xmm0, xmm1 ; sum in word unit (x8)
- ;xor r3, r3
- ;movd r3d, xmm0
- ;and edx, 0ffffh
-
- add r2, 8
- sar r2, 4
- SSE2_Copy16Times xmm1, r2d
- ;mov dh, dl
- ;mov r2, edx
- ;shl r2, 010h
- ;or edx, r2
- ;movd xmm1, edx
- ;pshufd xmm0, xmm1, 00h
- ;movdqa xmm1, xmm0
- movdqa xmm0, xmm1
- lea r2, [2*r1+r1] ; 3*kiStride
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
-
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
-
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
-
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
-
- POP_XMM
- ret
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1] ; 3*kiStride
-
- movdqa xmm0, [sse2_dc_0x80]
- movdqa xmm1, xmm0
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
- lea r0, [r0+4*r1]
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm0
- movdqa [r0+r2], xmm1
-
- ret
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
- push r3
- push r4
- %assign push_num 2
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- mov r4, r0
- ; for left
- dec r0
- xor r2, r2
- xor r3, r3
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- ;SSE2_Copy16Times mm0, r2d
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm1, r2d
- pshufw mm0, mm1, 00h
- ;mov bh, bl
- ;movd mm1, ebx
- ;pshufw mm0, mm1, 00h ; up64
- movq mm1, mm0
- xor r2, r2
- lea r0, [r0+2*r1]
- movzx r2, byte [r0]
- movzx r3, byte [r0+r1]
- add r2, r3
- lea r0, [r0+2*r1]
- movzx r3, byte [r0]
- add r2, r3
- movzx r3, byte [r0+r1]
- add r2, r3
- add r2, 02h
- sar r2, 02h
- mov r3, r2
- sal r3, 8
- or r2, r3
- movd mm3, r2d
- pshufw mm2, mm3, 00h
- ;mov bh, bl
- ;movd mm3, ebx
- ;pshufw mm2, mm3, 00h ; down64
- ;SSE2_Copy16Times mm2, r2d
- movq mm3, mm2
- lea r2, [2*r1+r1]
- movq [r4], mm0
- movq [r4+r1], mm1
- movq [r4+2*r1], mm0
- movq [r4+r2], mm1
- lea r4, [r4+4*r1]
- movq [r4], mm2
- movq [r4+r1], mm3
- movq [r4+2*r1], mm2
- movq [r4+r2], mm3
- pop r4
- pop r3
- emms
- ret
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- mov r2, r0
- sub r2, r1
- movq xmm0, [r2] ; top: 8x1 pixels
- pxor xmm7, xmm7
- punpcklbw xmm0, xmm7 ; ext 8x2 words
- pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
- paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
- movdqa xmm1, xmm0
- pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
- pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
- paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
- paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
- punpckhqdq xmm1, xmm7
- punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
- movdqa xmm6, [sse2_wd_0x02]
- paddw xmm0, xmm6
- psraw xmm0, 02h
- packuswb xmm0, xmm7
- lea r2, [2*r1+r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- lea r0, [r0+4*r1]
- movq [r0], xmm0
- movq [r0+r1], xmm0
- movq [r0+2*r1], xmm0
- movq [r0+r2], xmm0
- POP_XMM
- ret
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- lea r2, [2*r1+r1]
- movq mm0, [sse2_dc_0x80]
- movq mm1, mm0
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- lea r0, [r0+4*r1]
- movq [r0], mm0
- movq [r0+r1], mm1
- movq [r0+2*r1], mm0
- movq [r0+r2], mm1
- emms
- ret
-
--- /dev/null
+++ b/codec/decoder/core/x86/block_add.asm
@@ -1,0 +1,151 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* block_add.asm
+;*
+;* Abstract
+;* add block
+;*
+;* History
+;* 09/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+;*******************************************************************************
+; void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
+;*******************************************************************************
+WELS_EXTERN WelsResBlockZero16x16_sse2 ; writes zeros to 16 rows of 32 bytes (16 int16_t per row)
+ %assign push_num 0
+ LOAD_2_PARA ; presumably leaves pBlock in r0 and iStride in r1 (macro from asm_inc.asm) -- confirm
+ PUSH_XMM 8 ; presumably preserves XMM registers where the ABI requires it -- confirm in asm_inc.asm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ lea r1, [r1*2] ; r1 = 2*iStride, used below as the byte distance between rows
+ lea r2, [r1*3] ; r2 = byte distance of three rows
+
+ pxor xmm7, xmm7 ; xmm7 = 0, the value stored everywhere
+
+ ; rows 0-3 (movdqa faults on addresses that are not 16-byte aligned)
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
+
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
+
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
+
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
+
+ ; rows 4-7
+ lea r0, [r0+r1*4]
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
+
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
+
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
+
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
+
+ ; rows 8-11
+ lea r0, [r0+r1*4]
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
+
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
+
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
+
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
+
+ ; rows 12-15
+ lea r0, [r0+r1*4]
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
+
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
+
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
+
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
+
+ POP_XMM
+ ret
+
+
+;*******************************************************************************
+; void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
+;*******************************************************************************
+WELS_EXTERN WelsResBlockZero8x8_sse2 ; writes zeros to 8 rows of 16 bytes (8 int16_t per row)
+ %assign push_num 0
+ LOAD_2_PARA ; presumably leaves pBlock in r0 and iStride in r1 (macro from asm_inc.asm) -- confirm
+ PUSH_XMM 8 ; presumably preserves XMM registers where the ABI requires it -- confirm in asm_inc.asm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ lea r1, [r1*2] ; r1 = 2*iStride, used below as the byte distance between rows
+ lea r2, [r1*3] ; r2 = byte distance of three rows
+
+ pxor xmm7, xmm7 ; xmm7 = 0
+
+ movdqa [r0], xmm7 ; rows 0-3; movdqa needs 16-byte aligned addresses
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r2], xmm7
+
+ lea r0, [r0+r1*4] ; advance four rows
+ movdqa [r0], xmm7 ; rows 4-7
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r2], xmm7
+
+
+ POP_XMM
+ ret
+
--- /dev/null
+++ b/codec/decoder/core/x86/dct.asm
@@ -1,0 +1,115 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* dct.asm
+;*
+;* Abstract
+;* WelsDctFourT4_sse2
+;*
+;* History
+;* 8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+ movq %3, %2 ; macro result: %3 = %1 + (%2 >> 1), %1 = (%1 >> 1) - %2, %2 unchanged (16-bit lanes)
+ psraw %3, $01 ; %3 = %2 >> 1 (arithmetic shift)
+ paddw %3, %1 ; %3 = %1 + (%2 >> 1)
+ psraw %1, $01 ; %1 = %1 >> 1
+ psubw %1, %2 ; %1 = (%1 >> 1) - %2
+%endmacro
+; MMX_SumSub a, b, tmp: a = a + b and b = b - (old a), per 16-bit lane; tmp is clobbered
+%macro MMX_SumSub 3
+ movq %3, %2 ; tmp = old b
+ psubw %2, %1 ; b = b - a
+ paddw %1, %3 ; a = a + old b
+%endmacro
+; MMX_IDCT: four butterfly steps on %2..%5 built from the two macros above; %1 is overwritten by step 2, %6 is scratch
+%macro MMX_IDCT 6
+ MMX_SumSub %4, %5, %6 ; %4 = %4 + %5, %5 = %5 - old %4
+ MMX_SumSubDiv2 %3, %2, %1 ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
+ MMX_SumSub %1, %4, %6 ; %1 = %1 + %4, %4 = %4 - old %1
+ MMX_SumSub %3, %5, %6 ; %3 = %3 + %5, %5 = %5 - old %3
+%endmacro
+
+; MMX_StoreDiff4P res, tmp, round, zero, mem: with zero = 0, mem[0..3] = saturate_u8(((res + round) >> 6) + mem[0..3])
+%macro MMX_StoreDiff4P 5
+ movd %2, %5 ; 4 bytes currently in memory
+ punpcklbw %2, %4 ; interleave with %4: widens the bytes to words when %4 is zero
+ paddw %1, %3 ; add the rounding term
+ psraw %1, $06 ; arithmetic >> 6
+ paddsw %1, %2 ; signed-saturating add of the widened memory bytes
+ packuswb %1, %2 ; low 4 bytes = words of %1 saturated to 0..255
+ movd %5, %1 ; write the 4 result bytes back
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+;*******************************************************************************
+; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+WELS_EXTERN IdctResAddPred_mmx ; transforms the 16 int16_t at pRs and adds the result into the 4x4 bytes at pPred
+ %assign push_num 0
+ LOAD_3_PARA ; presumably r0 = pPred, r1 = kiStride, r2 = pRs (macro from asm_inc.asm) -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ movq mm0, [r2+ 0] ; four 8-byte rows of coefficients
+ movq mm1, [r2+ 8]
+ movq mm2, [r2+16]
+ movq mm3, [r2+24]
+
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 ; presumably a 4x4 transpose of 16-bit words (asm_inc.asm) -- confirm
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 ; first butterfly pass
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 ; second butterfly pass
+
+ WELS_Zero mm7 ; presumably mm7 = 0 -- confirm in asm_inc.asm
+ WELS_DW32 mm6 ; presumably mm6 = 32 in every word, the rounding term for the >> 6 -- confirm
+
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0] ; row 0 (mm0 is scratch)
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1] ; row 1
+ lea r0, [r0+2*r1]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0] ; row 2
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1] ; row 3
+
+
+ emms ; leave MMX state before returning
+ ret
--- /dev/null
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -1,0 +1,1414 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred.asm
+;*
+;* Abstract
+;* sse2 and mmx function for intra predict operations(decoder)
+;*
+;* History
+;* 18/09/2009 Created
+;* 19/11/2010 Added
+;* WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;* WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;* and WelsDecoderIChromaPredDcNA_mmx
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+SECTION .rodata align=16
+%if 1
+ %define WELSEMMS emms
+%else
+ %define WELSEMMS
+%endif
+
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 ; multipliers of b for columns 0..7 of a 16-pixel plane row
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 ; weights 1..8 of the H/V sums; also multipliers of b for columns 8..15
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 ; weights 8..1 for the mirrored half of the H/V sums
+
+; for chroma plane mode (four-word tables, read with movq)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 ; multipliers of b for columns 0..7 of a chroma plane row
+
+align 16
+mmx_01bytes: times 16 db 1 ; 0x01 in every byte: bit-0 mask for pand, byte-replicating multiplier for pmuludq/pmullw
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 ; the value 2 as a single 64-bit quantity
+
+align 16
+sse2_dc_0x80: times 16 db 0x80 ; 0x80 in every byte
+align 16
+sse2_wd_0x02: times 8 dw 0x02 ; 2 in every 16-bit word
+
+;*******************************************************************************
+; macros
+;*******************************************************************************
+;%1 = result xmm, %2 and %3 = scratch xmm, %4 = address of a row's first pixel, %5 = row stride
+;result: low dword of %1 = byte [%4-1] repeated 4 times, next dword = byte [%4+%5-1] repeated 4 times
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+ movd %1, [%4-1] ; low byte = the pixel left of the row
+ movdqa %3, %1
+ punpcklbw %1, %3 ; every byte doubled
+ movdqa %3, %1
+ punpcklbw %1, %3 ; every byte now repeated 4 times
+
+ ;add %4, %5
+ movd %2, [%4+%5-1] ; same replication for the next row
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2 ; low qword = [next row x4 | this row x4]
+%endmacro
+
+; LOAD_COLUMN dst, t1, t2, t3, addr, stride: reads a dword from each of 8 rows; high qword of dst = byte 3 of each dword in row order; addr advances 8 rows
+%macro LOAD_COLUMN 6
+ movd %1, [%5] ; row 0
+ movd %2, [%5+%6] ; row 1
+ punpcklbw %1, %2 ; bytes of rows 0 and 1 interleaved
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 2
+ movd %2, [%5+%6] ; row 3
+ punpcklbw %3, %2
+ punpcklwd %1, %3 ; dword k of %1 = byte k of rows 0..3
+ lea %5, [%5+2*%6]
+ movd %4, [%5] ; row 4
+ movd %2, [%5+%6] ; row 5
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 6
+ movd %2, [%5+%6] ; row 7
+ lea %5, [%5+2*%6] ; addr is now 8 rows below its starting value
+ punpcklbw %3, %2
+ punpcklwd %4, %3 ; dword k of %4 = byte k of rows 4..7
+ punpckhdq %1, %4 ; dwords (low to high): byte 2 rows 0..3, byte 2 rows 4..7, byte 3 rows 0..3, byte 3 rows 4..7
+%endmacro
+; SUMW_HORIZON x1, x2, x3: adds the 8 words of x1 into its low dword; x2 is scratch; x3 supplies the upper half of each widened dword
+%macro SUMW_HORIZON 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 (zero-extended to dwords when x3 = 0)
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 (its low 16 bits are the wrapping word sum whatever x3 holds)
+%endmacro
+; COPY_16_TIMES addr, xmm: xmm = byte [addr-1] copied into all 16 bytes (aligned 16-byte load from [addr-16])
+%macro COPY_16_TIMES 2
+ movdqa %2, [%1-16] ; the wanted byte is the highest byte of this load
+ psrldq %2, 15 ; move it to byte 0, clearing the rest
+ pmuludq %2, [mmx_01bytes] ; times 0x01010101: byte replicated across the low dword
+ pshufd %2, %2, 0 ; low dword broadcast to all four dwords
+%endmacro
+; COPY_16_TIMESS addr, xmm, stride: xmm = byte [addr+stride-1] copied into all 16 bytes (aligned 16-byte load)
+%macro COPY_16_TIMESS 3
+ movdqa %2, [%1+%3-16] ; the wanted byte is the highest byte of this load
+ psrldq %2, 15 ; move it to byte 0, clearing the rest
+ pmuludq %2, [mmx_01bytes] ; times 0x01010101: byte replicated across the low dword
+ pshufd %2, %2, 0 ; low dword broadcast to all four dwords
+%endmacro
+; LOAD_COLUMN_C dst, t1, t2, (unused), addr, stride: reads a dword from each of 4 rows; high dword of dst = byte 3 of each in row order; addr advances 4 rows
+%macro LOAD_COLUMN_C 6
+ movd %1, [%5] ; row 0
+ movd %2, [%5+%6] ; row 1
+ punpcklbw %1,%2 ; bytes of rows 0 and 1 interleaved
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 2
+ movd %2, [%5+%6] ; row 3
+ punpcklbw %3, %2
+ punpckhwd %1, %3 ; low dword = byte 2 of rows 0..3, high dword = byte 3 of rows 0..3
+ lea %5, [%5+2*%6] ; addr is now 4 rows below its starting value
+%endmacro
+; LOAD_2_LEFT_AND_ADD: r0 += 2*r1, then r2 += byte [r0-1] + byte [r0+r1-1]; r3 is used as scratch
+%macro LOAD_2_LEFT_AND_ADD 0
+ lea r0, [r0+2*r1] ; step down two rows
+ movzx r3, byte [r0-0x01] ; byte left of [r0]
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; byte left of [r0+r1]
+ add r2, r3
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+;*******************************************************************************
+; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
+;
+; pPred must align to 16
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2 ; each of the 4 rows gets 4 copies of the byte immediately left of it
+ %assign push_num 0
+ LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro from asm_inc.asm) -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+
+ movzx r2, byte [r0-1] ; left neighbour of row 0
+ movd xmm0, r2d
+ pmuludq xmm0, [mmx_01bytes] ; times 0x01010101: low dword = byte x4
+
+ movzx r2, byte [r0+r1-1] ; left neighbour of row 1
+ movd xmm1, r2d
+ pmuludq xmm1, [mmx_01bytes]
+
+ lea r0, [r0+r1] ; r0 -> row 1
+ movzx r2, byte [r0+r1-1] ; left neighbour of row 2
+ movd xmm2, r2d
+ pmuludq xmm2, [mmx_01bytes]
+
+ movzx r2, byte [r0+2*r1-1] ; left neighbour of row 3
+ movd xmm3, r2d
+ pmuludq xmm3, [mmx_01bytes]
+
+ sub r0, r1 ; r0 -> row 0 again
+ movd [r0], xmm0 ; row 0
+ movd [r0+r1], xmm1 ; row 1
+ lea r0, [r0+2*r1]
+ movd [r0], xmm2 ; row 2
+ movd [r0+r1], xmm3 ; row 3
+
+ ret
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2 ; 16x16 plane fill: pixel(x,y) = saturate_u8((a + b*(x-7) + c*(y-7) + 16) >> 5), in 16-bit lanes
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro from asm_inc.asm) -- confirm
+ PUSH_XMM 8 ; presumably preserves XMM registers where the ABI requires it -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ mov r4, r0 ; save r0 in r4
+ sub r0, 1
+ sub r0, r1 ; r0 -> top-left neighbour, pPred[-kiStride-1]
+
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r0] ; top-left neighbour followed by top[0..6]
+ movdqa xmm5, [sse2_plane_dec] ; weights 8..1
+ punpcklbw xmm0, xmm7 ; widen bytes to words
+ pmullw xmm0, xmm5
+ movq xmm1, [r0 + 9] ; top[8..15]
+ movdqa xmm6, [sse2_plane_inc] ; weights 1..8
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6
+ psubw xmm1, xmm0 ; the eight per-word terms of H
+
+ SUMW_HORIZON xmm1,xmm0,xmm2 ; xmm2 is not cleared here; only the low 16 bits of the sum are read below and they do not depend on it
+ movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r2, r2w ; keep the signed 16-bit sum
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r3, BYTE [r0+16] ; r3 = top[15]
+ sub r0, 3 ; dword loads below put the left-column byte in byte 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1 ; high qword of xmm0 = left column, rows -1..6; r0 moves down 8 rows
+
+ add r0, 3 ; r0 -> left-column byte of row 7
+ movzx r2, BYTE [r0+8*r1] ; left-column byte of row 15
+ add r3, r2
+ shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
+
+ sub r0, 3
+ add r0, r1 ; start of the dword loads for row 8
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1 ; high qword of xmm7 = left column, rows 8..15
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4 ; widen rows -1..6 to words
+ pmullw xmm0, xmm5 ; times 8..1
+ punpckhbw xmm7, xmm4 ; widen rows 8..15 to words
+ pmullw xmm7, xmm6 ; times 1..8
+ psubw xmm7, xmm0 ; the eight per-word terms of V
+
+ SUMW_HORIZON xmm7,xmm0,xmm2 ; again only the low 16 bits of the sum are used
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
+
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
+
+ mov r0, r4 ; r0 = pPred again
+ add r3, 16
+ imul r2, -7
+ add r3, r2 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r2, r2 ; row counter
+ movdqa xmm5, [sse2_plane_inc_minus] ; column multipliers -7..0 (xmm6 still holds 1..8)
+
+get_i16x16_luma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5 ; b * (x-7) for x = 0..7
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6 ; b * (x-7) for x = 8..15
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3 ; saturate the 16 words to bytes
+ movdqa [r0], xmm2 ; aligned 16-byte store of one row
+ paddw xmm0, xmm4 ; next row: s += c
+ add r0, r1
+ inc r2
+ cmp r2, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
+
+ POP_XMM
+ pop r4
+ pop r3
+ ret
+
+
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+; SSE2_PRED_H_16X16_TWO_LINE_DEC addr, stride: addr += 2*stride, then the rows at addr and addr+stride are each filled with their left byte (xmm0 clobbered)
+%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
+ lea %1, [%1+%2*2]
+
+ COPY_16_TIMES %1, xmm0 ; xmm0 = byte [%1-1] x16
+ movdqa [%1], xmm0
+ COPY_16_TIMESS %1, xmm0, %2 ; xmm0 = byte [%1+%2-1] x16
+ movdqa [%1+%2], xmm0
+%endmacro
+
+WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2 ; each of the 16 rows is filled with the byte immediately left of it
+ %assign push_num 0
+ LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro from asm_inc.asm) -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+
+ COPY_16_TIMES r0, xmm0 ; row 0: xmm0 = byte [r0-1] x16
+ movdqa [r0], xmm0
+ COPY_16_TIMESS r0, xmm0, r1 ; row 1: xmm0 = byte [r0+r1-1] x16
+ movdqa [r0+r1], xmm0
+
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 2-3
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 4-5
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 6-7
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 8-9
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 10-11
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 12-13
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 14-15
+
+ ret
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2 ; copies the 16 bytes of the row above pPred into each of the 16 rows
+ %assign push_num 0
+ LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro from asm_inc.asm) -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+
+ sub r0, r1 ; r0 -> row above the block
+ movdqa xmm0, [r0] ; the 16 top neighbours (aligned load)
+
+ movdqa [r0+r1], xmm0 ; row 0
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 1
+ movdqa [r0+r1], xmm0 ; row 2
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 3
+ movdqa [r0+r1], xmm0 ; row 4
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 5
+ movdqa [r0+r1], xmm0 ; row 6
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 7
+ movdqa [r0+r1], xmm0 ; row 8
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 9
+ movdqa [r0+r1], xmm0 ; row 10
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 11
+ movdqa [r0+r1], xmm0 ; row 12
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 13
+ movdqa [r0+r1], xmm0 ; row 14
+ lea r0, [r0+2*r1]
+ movdqa [r0], xmm0 ; row 15
+
+ ret
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredPlane_sse2 ; 8x8 plane fill: pixel(x,y) = saturate_u8((a + b*(x-3) + c*(y-3) + 16) >> 5), in 16-bit lanes
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA ; presumably r0 = pPred, r1 = kiStride (macro from asm_inc.asm) -- confirm
+ PUSH_XMM 8 ; presumably preserves XMM registers where the ABI requires it -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ mov r4, r0 ; keep pPred for the store loop
+ sub r0, 1
+ sub r0, r1 ; r0 -> top-left neighbour, pPred[-kiStride-1]
+
+ pxor mm7, mm7
+ movq mm0, [r0] ; low 4 bytes: top-left neighbour, top[0..2]
+ movq mm5, [sse2_plane_dec_c] ; weights 4..1
+ punpcklbw mm0, mm7 ; widen the low 4 bytes to words
+ pmullw mm0, mm5
+ movq mm1, [r0 + 5] ; low 4 bytes: top[4..7]
+ movq mm6, [sse2_plane_inc_c] ; weights 1..4
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6
+ psubw mm1, mm0 ; the four per-word terms of H
+
+ movq2dq xmm1, mm1 ; move to the low qword of xmm1, high qword cleared
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2 ; low dword of xmm1 = sum of the words
+ movd r2d, xmm1
+ movsx r2, r2w ; keep the signed 16-bit sum H
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r3, BYTE [r0+8] ; r3 = top[7]
+ sub r0, 3 ; dword loads below put the left-column byte in byte 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1 ; high dword of mm0 = left column, rows -1..2; r0 moves down 4 rows
+
+ add r0, 3 ; r0 -> left-column byte of row 3
+ movzx r2, BYTE [r0+4*r1] ; left-column byte of row 7
+ add r3, r2
+ shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
+
+ sub r0, 3
+ add r0, r1 ; start of the dword loads for row 4
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1 ; high dword of mm7 = left column, rows 4..7
+ pxor mm4, mm4
+ punpckhbw mm0, mm4 ; widen rows -1..2 to words
+ pmullw mm0, mm5 ; times 4..1
+ punpckhbw mm7, mm4 ; widen rows 4..7 to words
+ pmullw mm7, mm6 ; times 1..4
+ psubw mm7, mm0 ; the four per-word terms of V
+
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
+
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
+
+ mov r0, r4 ; r0 = pPred again
+ add r3, 16
+ imul r2, -3
+ add r3, r2 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r2, r2 ; row counter
+ movdqa xmm5, [sse2_plane_mul_b_c] ; column multipliers -3..4
+
+get_i_chroma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5 ; b * (x-3) for x = 0..7
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2 ; saturate the 8 words to bytes
+ movq [r0], xmm2 ; store one 8-byte row
+ paddw xmm0, xmm4 ; next row: s += c
+ add r0, r1
+ inc r2
+ cmp r2, 8
+ jnz get_i_chroma_pred_plane_sse2_1
+
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
+
+;*******************************************************************************
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pPred[7] = ([6]+[0]*2+[1]+2)/4
+;
+; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
+;
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx ; pN below = pixel N of the diagram above; register layouts are listed high byte -> low byte, -- = zero
+ %assign push_num 0
+ LOAD_2_PARA ; presumably r0 = pPred (pixel 7), r1 = kiStride (macro from asm_inc.asm) -- confirm
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride to full register width -- confirm
+ mov r2, r0
+
+ movq mm1,[r2+r1-8] ;8-byte load ending at p11: top byte of mm1 = p11
+ movq mm2,[r2-8] ;8-byte load ending at p6: top byte of mm2 = p6
+ sub r2, r1 ;r2 -> p1, in the row above the block
+ punpckhbw mm2,[r2-8] ;top byte of mm2 = p0, next byte = p6
+ movd mm3,[r2] ;low dword of mm3 = [p4 p3 p2 p1]
+ punpckhwd mm1,mm2 ;mm1 = [p0 p6 p11 xx xx xx xx xx]
+ psllq mm3,18h ;mm3 = [-- p4 p3 p2 p1 -- -- --]
+ psrlq mm1,28h ;mm1 = [-- -- -- -- -- p0 p6 p11]
+ por mm3,mm1 ;mm3 = [-- p4 p3 p2 p1 p0 p6 p11]
+ movq mm1,mm3 ;mm1 = [-- p4 p3 p2 p1 p0 p6 p11]
+ lea r2,[r2+r1*2-8h] ;r2 = pPred + kiStride - 8
+ movq mm4,[r2+r1] ;8-byte load ending at p16: top byte of mm4 = p16
+ psllq mm3,8 ;mm3 = [p4 p3 p2 p1 p0 p6 p11 --]
+ psrlq mm4,38h ;mm4 = p16 in the low byte only
+ por mm3,mm4 ;mm3 = [p4 p3 p2 p1 p0 p6 p11 p16]
+ movq mm2,mm3 ;mm2 = [p4 p3 p2 p1 p0 p6 p11 p16]
+ movq mm4,[r2+r1*2] ;8-byte load ending at p21: top byte of mm4 = p21
+ psllq mm3,8 ;mm3 = [p3 p2 p1 p0 p6 p11 p16 --]
+ psrlq mm4,38h ;mm4 = p21 in the low byte only
+ por mm3,mm4 ;mm3 = [p3 p2 p1 p0 p6 p11 p16 p21]
+ movq mm4,mm3 ;mm4 = copy of mm3, kept for the parity correction
+ pavgb mm3,mm1 ;per byte: (left + right + 1) >> 1 of the two outer neighbours
+ pxor mm1,mm4 ;bit 0 of each byte is set where the two neighbours differ in parity
+ pand mm1,[mmx_01bytes] ;keep only that bit
+ psubusb mm3,mm1 ;subtract it: mm3 = (left + right) >> 1, rounded down
+ pavgb mm2,mm3 ;per byte: (left + 2*middle + right + 2) >> 2
+
+ lea r0,[r0+r1]
+ movd [r0+2*r1],mm2 ;row 3 = result bytes 0..3
+ sub r0,r1
+ psrlq mm2,8
+ movd [r0+2*r1],mm2 ;row 2 = result bytes 1..4
+ psrlq mm2,8
+ movd [r0+r1],mm2 ;row 1 = result bytes 2..5
+ psrlq mm2,8
+ movd [r0],mm2 ;row 0 = result bytes 3..6
+ WELSEMMS
+ ret
+
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+; fill each of the 8 rows with 8 copies of that row's left-neighbour pixel
+;*******************************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4 ; %1 = MMX work register, %2 = not referenced by the body, %3 = pointer to the row's first pixel, %4 = destination address expression
+ movq %1, [%3-8] ; 8-byte load ending on the byte just left of [%3]; that left neighbour lands in the top byte
+ psrlq %1, 38h ; shift right 56 bits: only the left neighbour remains, in the lowest byte
+ ; duplicate the byte within the low word, then broadcast the word
+ pmullw %1, [mmx_01bytes] ; presumably multiplies by 0x0101 per word, giving the pixel in both bytes of the low word -- mmx_01bytes is defined outside this chunk
+ pshufw %1, %1, 0 ; copy the low word into all four words
+ movq [%4], %1 ; store the 8-pixel row
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4 ; same as MMX_PRED_H_8X8_ONE_LINE but reads the row one stride (r1) below %3; %1 = MMX work register, %2 = not referenced, %4 = destination address expression
+ movq %1, [%3+r1-8] ; 8-byte load ending on the left neighbour of the row at %3+r1 (r1 is used directly, not passed as an argument)
+ psrlq %1, 38h ; shift right 56 bits: only the left neighbour remains, in the lowest byte
+ ; duplicate the byte within the low word, then broadcast the word
+ pmullw %1, [mmx_01bytes] ; presumably multiplies by 0x0101 per word, giving the pixel in both bytes of the low word -- mmx_01bytes is defined outside this chunk
+ pshufw %1, %1, 0 ; copy the low word into all four words
+ movq [%4], %1 ; store the 8-pixel row
+%endmacro
+
+WELS_EXTERN WelsDecoderIChromaPredH_mmx ; 8x8 chroma horizontal prediction: every row is filled with its own left-neighbour pixel
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0 ; r2 = source row pointer (even rows), r0 = destination pointer; both advance two rows at a time
+ ; row 0 is written out inline with the same sequence as MMX_PRED_H_8X8_ONE_LINE
+ movq mm0, [r2-8] ; 8 bytes ending on the left neighbour of row 0
+ psrlq mm0, 38h ; keep only that neighbour, in the lowest byte
+
+ pmullw mm0, [mmx_01bytes] ; presumably duplicates the byte within the low word (0x0101 multiplier) -- constant defined outside this chunk
+ pshufw mm0, mm0, 0 ; broadcast the low word to all four words
+ movq [r0], mm0 ; store row 0
+
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 1 (source r2+r1)
+
+ lea r2, [r2+r1*2] ; r2 -> row 2
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 2
+
+ lea r0, [r0+2*r1] ; r0 -> row 2
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 3
+
+ lea r2, [r2+r1*2] ; r2 -> row 4
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 4
+
+ lea r0, [r0+2*r1] ; r0 -> row 4
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 5
+
+ lea r2, [r2+r1*2] ; r2 -> row 6
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1 ; row 6
+
+ lea r0, [r0+2*r1] ; r0 -> row 6
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1 ; row 7
+
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+; copy the 8 pixels of the row above the block into each of the 8 rows
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredV_mmx ; 8x8 chroma vertical prediction: replicate the 8 pixels above the block into all 8 rows
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+
+ sub r0, r1 ; r0 -> row above the block
+ movq mm0, [r0] ; mm0 = the 8 top-neighbour pixels
+ ; r0 then advances two rows at a time; each step writes the next two rows
+ movq [r0+r1], mm0 ; row 0
+ movq [r0+2*r1], mm0 ; row 1
+ lea r0, [r0+2*r1] ; r0 -> row 1
+ movq [r0+r1], mm0 ; row 2
+ movq [r0+2*r1], mm0 ; row 3
+ lea r0, [r0+2*r1] ; r0 -> row 3
+ movq [r0+r1], mm0 ; row 4
+ movq [r0+2*r1], mm0 ; row 5
+ lea r0, [r0+2*r1] ; r0 -> row 5
+ movq [r0+r1], mm0 ; row 6
+ movq [r0+2*r1], mm0 ; row 7
+
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+;*******************************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
+
+; a = (1 + lt + l0)>>1
+; e = (1 + l0 + l1)>>1
+; g = (1 + l1 + l2)>>1
+; i = (1 + l2 + l3)>>1
+
+; d = (2 + t0 + (t1<<1) + t2)>>2
+; c = (2 + lt + (t0<<1) + t1)>>2
+; b = (2 + l0 + (lt<<1) + t0)>>2
+
+; f = (2 + l1 + (l0<<1) + lt)>>2
+; h = (2 + l2 + (l1<<1) + l0)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
+; [b a f e h g j i] + [d c b a] --> mov to memory
+;
+; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx ; 4x4 luma horizontal-down prediction, computed from one packed neighbour vector [t2 t1 t0 lt l0 l1 l2 l3]
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+ sub r2, r1 ; r2 -> t0 (row above the block)
+ movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt] (vectors are written high byte first)
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+ ; gather the left column; each 4-byte load ends on the left-neighbour pixel of its row
+ movd mm1, [r2+2*r1-4] ; byte 3 = l1
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1] ; r2 -> row 1 of the block
+ movd mm2, [r2+2*r1-4] ; byte 3 = l3
+ punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [0 0 0 0 l0 l1 l2 l3]
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+ ; three-tap filter (a + 2*b + c + 2)>>2 built from two pavgb with a rounding correction in between
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2 ; mm3 = copy of the 1-byte-shifted vector, for the two-tap averages
+ movq mm4, mm1 ; mm4 = copy of the 2-byte-shifted vector, for the rounding correction
+ pavgb mm1, mm0 ; (a+c+1)>>1 of the two outer taps
+
+ pxor mm4, mm0 ; low bit of each byte is set where a+c is odd, i.e. where pavgb rounded up
+ pand mm4, [mmx_01bytes] ; keep only that bit -- presumably mmx_01bytes holds 0x01 in every byte (defined outside this chunk)
+ psubusb mm1, mm4 ; undo the round-up: mm1 = (a+c)>>1
+
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+ ; two-tap averages a, e, g, i
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+ ; assemble row 0 = a b c d
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h g]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+ ; movd stores the low dword, lowest byte first
+ movd [r0], mm2 ; row 0 = a b c d
+ lea r0, [r0+r1] ; r0 -> row 1
+ movd [r0+2*r1], mm3 ; row 3 = i j g h
+ sub r0, r1 ; r0 -> row 0 again
+ psrlq mm3, 10h
+ movd [r0+2*r1], mm3 ; row 2 = g h e f
+ psrlq mm3, 10h
+ movd [r0+r1], mm3 ; row 1 = e f a b
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+
+;*******************************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; lt and t0..t3 are never used (only l0..l3 feed the prediction)
+; destination:
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
+
+; a = (1 + l0 + l1)>>1
+; c = (1 + l1 + l2)>>1
+; e = (1 + l2 + l3)>>1
+; g = l3
+
+; b = (2 + l0 + (l1<<1) + l2)>>2
+; d = (2 + l1 + (l2<<1) + l3)>>2
+; f = (2 + l2 + (l3<<1) + l3)>>2
+
+; [g g f e d c b a] + [g g g g] --> mov to memory
+;
+; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx ; 4x4 luma horizontal-up prediction, computed from the left column l0..l3 only
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0 ; r2 walks the left column; each 4-byte load below ends on a left-neighbour pixel
+
+ movd mm0, [r2-4] ; mm0[3] = l0
+ punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r2, [r2+2*r1] ; r2 -> row 2 of the block
+ movd mm2, [r2-4] ; mm2[3] = l2
+ movd mm4, [r2+r1-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4 ; mm2[7] = l3, mm2[6] = l2
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx] (vectors are written high byte first)
+ ; duplicate l3 at the top so the filters see l3 past the end of the column
+ psrlq mm4, 18h ; l3 moved down to byte 0
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+ ; two-tap averages a, c, e (and g = l3)
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+ ; three-tap filter (a + 2*b + c + 2)>>2 built from two pavgb with a rounding correction in between
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2 ; mm5 = copy kept for the rounding correction
+ pavgb mm2, mm0 ; (a+c+1)>>1 of the two outer taps
+
+ pxor mm5, mm0 ; low bit of each byte is set where a+c is odd, i.e. where pavgb rounded up
+ pand mm5, [mmx_01bytes] ; keep only that bit -- presumably mmx_01bytes holds 0x01 in every byte (defined outside this chunk)
+ psubusb mm2, mm5 ; undo the round-up: mm2 = (a+c)>>1
+
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+ ; movd stores the low dword, lowest byte first
+ psrlq mm4, 20h ; mm4 low dword = g g g g
+ lea r0, [r0+r1] ; r0 -> row 1
+ movd [r0+2*r1], mm4 ; row 3 = g g g g
+
+ sub r0, r1 ; r0 -> row 0 again
+ movd [r0], mm1 ; row 0 = a b c d
+ psrlq mm1, 10h
+ movd [r0+r1], mm1 ; row 1 = c d e f
+ psrlq mm1, 10h
+ movd [r0+2*r1], mm1 ; row 2 = e f g g
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+
+;*******************************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
+
+; a = (1 + lt + t0)>>1
+; b = (1 + t0 + t1)>>1
+; c = (1 + t1 + t2)>>1
+; d = (1 + t2 + t3)>>1
+
+; e = (2 + l0 + (lt<<1) + t0)>>2
+; f = (2 + lt + (t0<<1) + t1)>>2
+; g = (2 + t0 + (t1<<1) + t2)>>2
+
+; h = (2 + t1 + (t2<<1) + t3)>>2
+; i = (2 + lt + (l0<<1) + l1)>>2
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
+; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx ; 4x4 luma vertical-right prediction, computed from one packed neighbour vector [t3 t2 t1 t0 lt l0 l1 l2]
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+ sub r2, r1 ; r2 -> t0 (row above the block)
+ movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt] (vectors are written high byte first)
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+ ; gather l0..l2; each load ends on the left-neighbour pixel of its row
+ movd mm1, [r2+2*r1-4] ; byte 3 = l1
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1] ; r2 -> row 1 of the block
+ movq mm2, [r2+r1-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h ; mm2 = [0 0 0 0 0 l0 l1 l2]
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+ ; two-tap averages a..d
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+ ; three-tap filter (a + 2*b + c + 2)>>2 built from two pavgb with a rounding correction in between
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2 ; mm3 = copy kept for the rounding correction
+ pavgb mm2, mm0 ; (a+c+1)>>1 of the two outer taps
+
+ pxor mm3, mm0 ; low bit of each byte is set where a+c is odd, i.e. where pavgb rounded up
+ pand mm3, [mmx_01bytes] ; keep only that bit -- presumably mmx_01bytes holds 0x01 in every byte (defined outside this chunk)
+ psubusb mm2, mm3 ; undo the round-up: mm2 = (a+c)>>1
+
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
+ ; movd stores the low dword, lowest byte first
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1 ; row 0 = a b c d
+
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+r1], mm2 ; row 1 = e f g h
+ ; isolate i and j, which lead rows 2 and 3
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+
+ psllq mm1, 8h ; make room in byte 0 for i
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+2*r1], mm4 ; row 2 = i a b c
+
+ psllq mm2, 8h ; make room in byte 0 for j
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ lea r0, [r0+2*r1] ; r0 -> row 2
+ movd [r0+r1], mm5 ; row 3 = j e f g
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+;*******************************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt and l0..l3 are never used (only t0..t7 feed the prediction)
+; destination:
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
+
+; a = (2 + t0 + t2 + (t1<<1))>>2
+; b = (2 + t1 + t3 + (t2<<1))>>2
+; c = (2 + t2 + t4 + (t3<<1))>>2
+; d = (2 + t3 + t5 + (t4<<1))>>2
+
+; e = (2 + t4 + t6 + (t5<<1))>>2
+; f = (2 + t5 + t7 + (t6<<1))>>2
+; g = (2 + t6 + t7 + (t7<<1))>>2
+
+; [g f e d c b a] --> mov to memory
+;
+; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx ; 4x4 luma diagonal-down-left prediction: (a + 2*b + c + 2)>>2 over the 8 pixels above and above-right of the block
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+ sub r2, r1 ; r2 -> t0 (row above the block)
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] (vectors are written high byte first)
+ movq mm1, mm0
+ movq mm2, mm0
+ ; isolate t7 so it can be duplicated at the top of the right-shifted copy
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+ ; three-tap filter built from two pavgb with a rounding correction in between
+ movq mm3, mm1 ; mm3 = copy kept for the rounding correction
+ pavgb mm1, mm2 ; (a+c+1)>>1 of the two outer taps
+ pxor mm3, mm2 ; low bit of each byte is set where a+c is odd, i.e. where pavgb rounded up
+ pand mm3, [mmx_01bytes] ; keep only that bit -- presumably mmx_01bytes holds 0x01 in every byte (defined outside this chunk)
+ psubusb mm1, mm3 ; undo the round-up: mm1 = (a+c)>>1
+
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+ ; movd stores the low dword, lowest byte first; each row is the previous one shifted by one pixel
+ psrlq mm0, 8h
+ movd [r0], mm0 ; row 0 = a b c d
+ psrlq mm0, 8h
+ movd [r0+r1], mm0 ; row 1 = b c d e
+ psrlq mm0, 8h
+ movd [r0+2*r1], mm0 ; row 2 = c d e f
+ psrlq mm0, 8h
+ lea r0, [r0+2*r1] ; r0 -> row 2
+ movd [r0+r1], mm0 ; row 3 = d e f g
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+;*******************************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt, l0..l3 and t7 are never used (only t0..t6 feed the prediction)
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
+
+; a = (1 + t0 + t1)>>1
+; b = (1 + t1 + t2)>>1
+; c = (1 + t2 + t3)>>1
+; d = (1 + t3 + t4)>>1
+; i = (1 + t4 + t5)>>1
+
+; e = (2 + t0 + (t1<<1) + t2)>>2
+; f = (2 + t1 + (t2<<1) + t3)>>2
+; g = (2 + t2 + (t3<<1) + t4)>>2
+; h = (2 + t3 + (t4<<1) + t5)>>2
+; j = (2 + t4 + (t5<<1) + t6)>>2
+
+; [i d c b a] + [j h g f e] --> mov to memory
+;
+; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx ; 4x4 luma vertical-left prediction: rows 0/2 use two-tap averages, rows 1/3 use the three-tap filter, all over the top row
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+
+ sub r2, r1 ; r2 -> t0 (row above the block)
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0] (vectors are written high byte first)
+ movq mm1, mm0
+ movq mm2, mm0
+
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+ ; two-tap averages a, b, c, d, i
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+ ; three-tap filter (a + 2*b + c + 2)>>2 built from two pavgb with a rounding correction in between
+ movq mm4, mm2 ; mm4 = copy kept for the rounding correction
+ pavgb mm2, mm0 ; (a+c+1)>>1 of the two outer taps
+ pxor mm4, mm0 ; low bit of each byte is set where a+c is odd, i.e. where pavgb rounded up
+ pand mm4, [mmx_01bytes] ; keep only that bit -- presumably mmx_01bytes holds 0x01 in every byte (defined outside this chunk)
+ psubusb mm2, mm4 ; undo the round-up: mm2 = (a+c)>>1
+
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+ ; movd stores the low dword, lowest byte first
+ movd [r0], mm3 ; row 0 = a b c d
+ psrlq mm3, 8h
+ movd [r0+2*r1], mm3 ; row 2 = b c d i
+
+ movd [r0+r1], mm2 ; row 1 = e f g h
+ psrlq mm2, 8h
+ lea r0, [r0+2*r1] ; r0 -> row 2
+ movd [r0+r1], mm2 ; row 3 = f g h j
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+;*******************************************************************************
+;
+; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDc_sse2 ; 8x8 chroma DC prediction with top and left neighbours: one DC value per 4x4 quadrant; works in MMX registers only
+ push r3 ; r3 and r4 are used as scratch below and restored before returning
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r4, r0 ; r4 keeps pPred for the stores; r0 walks the neighbours
+
+ sub r0, r1 ; r0 -> row above the block
+ movq mm0, [r0] ; mm0 = the 8 top-neighbour pixels
+ ; sum the left neighbours of rows 0..3 (named l1..l4 here)
+ movzx r2, byte [r0+r1-0x01] ; l1
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l2
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l3
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l4
+ add r2, r3
+ movd mm1, r2d ; mm1 = l1+l2+l3+l4
+ ; sum the left neighbours of rows 4..7 (l5..l8)
+ movzx r2, byte [r0+r1-0x01] ; l5
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l6
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l7
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l8
+ add r2, r3
+ movd mm2, r2d ; mm2 = l5+l6+l7+l8
+ ; split the top row into its two halves and sum each with psadbw against zero
+ movq mm3, mm0
+ psrlq mm0, 0x20 ; mm0 = top pixels 4..7
+ psllq mm3, 0x20
+ psrlq mm3, 0x20 ; mm3 = top pixels 0..3
+ pxor mm4, mm4
+ psadbw mm0, mm4 ; mm0 = sum of top pixels 4..7
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+ paddq mm3, mm1 ; mm3 = sum1 = top 0..3 + left rows 0..3 (8 samples)
+ movq mm1, mm2
+ paddq mm1, mm0 ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1 (top 4..7 + left rows 4..7, 8 samples)
+
+ movq mm4, [mmx_0x02] ; rounding term -- presumably the value 2 (constant defined outside this chunk)
+ ; 4-sample sums: (sum + 2) >> 2
+ paddq mm0, mm4
+ psrlq mm0, 0x02
+
+ paddq mm2, mm4
+ psrlq mm2, 0x02
+ ; 8-sample sums: (sum + 2 + 2) >> 3
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03
+
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03
+ ; replicate each DC byte across a dword -- presumably the low dword of mmx_01bytes is 0x01010101 (defined outside this chunk)
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up: low 4 bytes = DC from sum1, high 4 bytes = DC from sum2
+
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm1 = m_down: low 4 bytes = DC from sum3, high 4 bytes = DC from sum4
+ ; rows 0..3 get m_up, rows 4..7 get m_down
+ movq [r4], mm0 ; row 0
+ movq [r4+r1], mm0 ; row 1
+ movq [r4+2*r1], mm0 ; row 2
+ lea r4, [r4+2*r1] ; r4 -> row 2
+ movq [r4+r1], mm0 ; row 3
+
+ movq [r4+2*r1], mm1 ; row 4
+ lea r4, [r4+2*r1] ; r4 -> row 4
+ movq [r4+r1], mm1 ; row 5
+ movq [r4+2*r1], mm1 ; row 6
+ lea r4, [r4+2*r1] ; r4 -> row 6
+ movq [r4+r1], mm1 ; row 7
+
+ pop r4
+ pop r3
+ WELSEMMS ; presumably issues emms to leave MMX state -- macro defined outside this chunk
+ ret
+
+
+
+;*******************************************************************************
+;
+; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2 ; 16x16 luma DC prediction: fill the block with (sum of top row + sum of left column + 16) >> 5
+ push r3 ; r3 and r4 are used as scratch below and restored before returning
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r4, r0 ; r4 keeps pPred for the stores; r0 walks the neighbours
+ sub r0, r1 ; r0 -> row above the block
+ movdqa xmm0, [r0] ; read one row
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1 ; two partial sums, one per 8-byte half
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08 ; xmm1 = sum of the upper 8 pixels
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08 ; xmm0 = sum of the lower 8 pixels, upper half cleared
+ paddw xmm0, xmm1 ; xmm0 = sum of all 16 top pixels
+ ; accumulate the left column in r2
+ movzx r2, byte [r0+r1-0x01] ; left neighbour of row 0
+ movzx r3, byte [r0+2*r1-0x01] ; left neighbour of row 1
+ add r2, r3
+ lea r0, [r0+r1] ; r0 -> row 0
+ LOAD_2_LEFT_AND_ADD ; macro defined outside this chunk; presumably adds the next two left neighbours to r2 (7 uses would cover rows 2..15) -- TODO confirm
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r2, 0x10 ; rounding term for the >>5 below
+ movd xmm1, r2d
+ paddw xmm0, xmm1 ; top sum + left sum + 16
+ psrld xmm0, 0x05 ; divide by 32 -> DC value
+ pmuludq xmm0, [mmx_01bytes] ; replicate the DC byte across a dword -- presumably the constant's dwords are 0x01010101 (defined outside this chunk)
+ pshufd xmm0, xmm0, 0 ; broadcast that dword: 16 copies of the DC byte
+ ; movdqa stores: every destination row must be 16-byte aligned
+ movdqa [r4], xmm0 ; row 0
+ movdqa [r4+r1], xmm0 ; row 1
+ movdqa [r4+2*r1], xmm0 ; row 2
+ lea r4, [r4+2*r1] ; r4 -> row 2
+
+ movdqa [r4+r1], xmm0 ; row 3
+ movdqa [r4+2*r1], xmm0 ; row 4
+ lea r4, [r4+2*r1] ; r4 -> row 4
+
+ movdqa [r4+r1], xmm0 ; row 5
+ movdqa [r4+2*r1], xmm0 ; row 6
+ lea r4, [r4+2*r1] ; r4 -> row 6
+
+ movdqa [r4+r1], xmm0 ; row 7
+ movdqa [r4+2*r1], xmm0 ; row 8
+ lea r4, [r4+2*r1] ; r4 -> row 8
+
+ movdqa [r4+r1], xmm0 ; row 9
+ movdqa [r4+2*r1], xmm0 ; row 10
+ lea r4, [r4+2*r1] ; r4 -> row 10
+
+ movdqa [r4+r1], xmm0 ; row 11
+ movdqa [r4+2*r1], xmm0 ; row 12
+ lea r4, [r4+2*r1] ; r4 -> row 12
+
+ movdqa [r4+r1], xmm0 ; row 13
+ movdqa [r4+2*r1], xmm0 ; row 14
+ lea r4, [r4+2*r1] ; r4 -> row 14
+
+ movdqa [r4+r1], xmm0 ; row 15
+
+ pop r4
+ pop r3
+
+ ret
+
+;*******************************************************************************
+; for intra prediction as follows, 11/19/2010
+;*******************************************************************************
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2 ; 16x16 luma DC prediction from the top row only: fill the block with (sum of 16 top pixels + 8) >> 4
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ PUSH_XMM 8 ; macro defined outside this chunk; presumably preserves XMM registers where the ABI requires it (xmm7 is used below)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+ sub r2, r1 ; r2 -> row above the block
+ movdqa xmm0, [r2] ; pPred-kiStride, top line
+ pxor xmm7, xmm7
+ psadbw xmm0, xmm7 ; two partial sums, one per 8-byte half
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8 ; bring the upper partial sum down
+ paddw xmm0, xmm1 ; low word = sum of all 16 top pixels
+ xor r2, r2
+ movd r2d, xmm0 ; r2 = that sum
+ ;movdqa xmm1, xmm0
+ ;punpcklbw xmm0, xmm7
+ ;punpckhbw xmm1, xmm7
+
+ ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
+ ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
+ ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+ ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+ ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+ ;pshuflw xmm1, xmm0, 0b1h ; 10110001
+ ;paddw xmm0, xmm1 ; sum in word unit (x8)
+ ;xor r3, r3
+ ;movd r3d, xmm0
+ ;and edx, 0ffffh
+
+ add r2, 8 ; rounding term for the >>4 below
+ sar r2, 4 ; r2 = DC value
+ SSE2_Copy16Times xmm1, r2d ; macro defined outside this chunk; presumably broadcasts the DC byte to all 16 bytes of xmm1 -- TODO confirm
+ ;mov dh, dl
+ ;mov r2, edx
+ ;shl r2, 010h
+ ;or edx, r2
+ ;movd xmm1, edx
+ ;pshufd xmm0, xmm1, 00h
+ ;movdqa xmm1, xmm0
+ movdqa xmm0, xmm1 ; xmm0 and xmm1 hold the same fill value; the stores alternate between them
+ lea r2, [2*r1+r1] ; 3*kiStride
+ ; 16 rows in four groups of four; movdqa requires 16-byte aligned rows
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+
+ lea r0, [r0+4*r1] ; r0 -> row 4
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+
+ lea r0, [r0+4*r1] ; r0 -> row 8
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+
+ lea r0, [r0+4*r1] ; r0 -> row 12
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+
+ POP_XMM ; counterpart of PUSH_XMM above (macro defined outside this chunk)
+ ret
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2 ; 16x16 luma DC prediction with no neighbours available: fill the block with the constant at sse2_dc_0x80
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ lea r2, [2*r1+r1] ; 3*kiStride
+ ; 16 rows in four groups of four; movdqa requires 16-byte aligned rows
+ movdqa xmm0, [sse2_dc_0x80] ; fill pattern -- presumably 0x80 in every byte, going by the name (constant defined outside this chunk)
+ movdqa xmm1, xmm0 ; second copy; the stores alternate between the two registers
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1] ; r0 -> row 4
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1] ; r0 -> row 8
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1] ; r0 -> row 12
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+
+ ret
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx ; 8x8 chroma DC prediction from the left column only: rows 0..3 and rows 4..7 each get (sum of their 4 left neighbours + 2) >> 2
+ push r3 ; r3 and r4 are used as scratch below and restored before returning
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r4, r0 ; r4 keeps pPred for the stores; r0 walks the left column
+ ; for left
+ dec r0 ; r0 -> left neighbour of row 0
+ xor r2, r2 ; NOTE(review): redundant, the movzx below overwrites r2
+ xor r3, r3 ; NOTE(review): redundant, the movzx below overwrites r3
+ movzx r2, byte [r0] ; left of row 0
+ movzx r3, byte [r0+r1] ; left of row 1
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0] ; left of row 2
+ add r2, r3
+ movzx r3, byte [r0+r1] ; left of row 3
+ add r2, r3
+ add r2, 02h ; rounding term
+ sar r2, 02h ; r2 = DC of the upper half
+ ;SSE2_Copy16Times mm0, r2d
+ mov r3, r2
+ sal r3, 8
+ or r2, r3 ; duplicate the DC byte into both bytes of the low word
+ movd mm1, r2d
+ pshufw mm0, mm1, 00h ; broadcast the low word: mm0 = 8 copies of the upper DC
+ ;mov bh, bl
+ ;movd mm1, ebx
+ ;pshufw mm0, mm1, 00h ; up64
+ movq mm1, mm0 ; second copy; the stores alternate between mm0 and mm1
+ xor r2, r2 ; NOTE(review): redundant, the movzx below overwrites r2
+ lea r0, [r0+2*r1]
+ movzx r2, byte [r0] ; left of row 4
+ movzx r3, byte [r0+r1] ; left of row 5
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0] ; left of row 6
+ add r2, r3
+ movzx r3, byte [r0+r1] ; left of row 7
+ add r2, r3
+ add r2, 02h ; rounding term
+ sar r2, 02h ; r2 = DC of the lower half
+ mov r3, r2
+ sal r3, 8
+ or r2, r3 ; duplicate the DC byte into both bytes of the low word
+ movd mm3, r2d
+ pshufw mm2, mm3, 00h ; broadcast the low word: mm2 = 8 copies of the lower DC
+ ;mov bh, bl
+ ;movd mm3, ebx
+ ;pshufw mm2, mm3, 00h ; down64
+ ;SSE2_Copy16Times mm2, r2d
+ movq mm3, mm2 ; second copy; the stores alternate between mm2 and mm3
+ lea r2, [2*r1+r1] ; 3*kiStride
+ movq [r4], mm0 ; rows 0..3 = upper DC
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm0
+ movq [r4+r2], mm1
+ lea r4, [r4+4*r1] ; r4 -> row 4
+ movq [r4], mm2 ; rows 4..7 = lower DC
+ movq [r4+r1], mm3
+ movq [r4+2*r1], mm2
+ movq [r4+r2], mm3
+ pop r4
+ pop r3
+ emms ; leave MMX state; NOTE(review): sibling routines use the WELSEMMS macro here
+ ret
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2 ; 8x8 chroma DC prediction from the top row only: each 4-pixel-wide column half gets (sum of its 4 top pixels + 2) >> 2, in all 8 rows
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ PUSH_XMM 8 ; macro defined outside this chunk; presumably preserves XMM registers where the ABI requires it (xmm6/xmm7 are used below)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ mov r2, r0
+ sub r2, r1 ; r2 -> row above the block
+ movq xmm0, [r2] ; top: 8x1 pixels
+ pxor xmm7, xmm7
+ punpcklbw xmm0, xmm7 ; ext 8x2 words
+ pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+ paddw xmm0, xmm1 ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+ movdqa xmm1, xmm0
+ pshuflw xmm2, xmm0, 0B1h ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+ pshufhw xmm3, xmm1, 0B1h ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+ paddw xmm0, xmm2 ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+ paddw xmm1, xmm3 ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+ punpckhqdq xmm1, xmm7 ; move the w4..w7 sums into the low half, zero the high half
+ punpcklqdq xmm0, xmm1 ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+ movdqa xmm6, [sse2_wd_0x02] ; rounding term -- presumably the word value 2 in every lane (constant defined outside this chunk)
+ paddw xmm0, xmm6
+ psraw xmm0, 02h ; (sum + 2) >> 2 per lane
+ packuswb xmm0, xmm7 ; low 8 bytes = DC0 x4, DC1 x4
+ lea r2, [2*r1+r1] ; 3*kiStride
+ movq [r0], xmm0 ; rows 0..3
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ lea r0, [r0+4*r1] ; r0 -> row 4
+ movq [r0], xmm0 ; rows 4..7
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ POP_XMM ; counterpart of PUSH_XMM above (macro defined outside this chunk)
+ ret
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx ; 8x8 chroma DC prediction with no neighbours available: fill the block with the first 8 bytes of sse2_dc_0x80
+ %assign push_num 0
+ LOAD_2_PARA ; r0 = pPred, r1 = kiStride (assumed from the prototype; macro defined outside this chunk)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit stride for pointer arithmetic -- macro defined outside this chunk
+ lea r2, [2*r1+r1] ; 3*kiStride
+ movq mm0, [sse2_dc_0x80] ; fill pattern -- presumably 0x80 in every byte, going by the name (constant defined outside this chunk)
+ movq mm1, mm0 ; second copy; the stores alternate between the two registers
+ movq [r0], mm0 ; rows 0..3
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ lea r0, [r0+4*r1] ; r0 -> row 4
+ movq [r0], mm0 ; rows 4..7
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ emms ; leave MMX state; NOTE(review): sibling routines use the WELSEMMS macro here
+ ret
+
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -27,9 +27,9 @@
ifeq ($(ASM_ARCH), x86)
DECODER_ASM_SRCS=\
- $(DECODER_SRCDIR)/core/asm/block_add.asm\
- $(DECODER_SRCDIR)/core/asm/dct.asm\
- $(DECODER_SRCDIR)/core/asm/intra_pred.asm\
+ $(DECODER_SRCDIR)/core/x86/block_add.asm\
+ $(DECODER_SRCDIR)/core/x86/dct.asm\
+ $(DECODER_SRCDIR)/core/x86/intra_pred.asm\
DECODER_OBJS += $(DECODER_ASM_SRCS:.asm=.$(OBJ))
endif
--- a/codec/encoder/core/asm/coeff.asm
+++ /dev/null
@@ -1,459 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* memzero.asm
-;*
-;* Abstract
-;* cavlc
-;*
-;* History
-;* 09/08/2010 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-
-%ifdef X86_32
-SECTION .rodata align=16
-
-align 16
-sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
-
-ALIGN 16
-sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1
-
-align 16
-byte_1pos_table:
- db 0,0,0,0,0,0,0,0, ;0
- db 0,0,0,0,0,0,0,1, ;1
- db 1,0,0,0,0,0,0,1, ;2
- db 1,0,0,0,0,0,0,2, ;3
- db 2,0,0,0,0,0,0,1, ;4
- db 2,0,0,0,0,0,0,2, ;5
- db 2,1,0,0,0,0,0,2, ;6
- db 2,1,0,0,0,0,0,3, ;7
- db 3,0,0,0,0,0,0,1, ;8
- db 3,0,0,0,0,0,0,2, ;9
- db 3,1,0,0,0,0,0,2, ;10
- db 3,1,0,0,0,0,0,3, ;11
- db 3,2,0,0,0,0,0,2, ;12
- db 3,2,0,0,0,0,0,3, ;13
- db 3,2,1,0,0,0,0,3, ;14
- db 3,2,1,0,0,0,0,4, ;15
- db 4,0,0,0,0,0,0,1, ;16
- db 4,0,0,0,0,0,0,2, ;17
- db 4,1,0,0,0,0,0,2, ;18
- db 4,1,0,0,0,0,0,3, ;19
- db 4,2,0,0,0,0,0,2, ;20
- db 4,2,0,0,0,0,0,3, ;21
- db 4,2,1,0,0,0,0,3, ;22
- db 4,2,1,0,0,0,0,4, ;23
- db 4,3,0,0,0,0,0,2, ;24
- db 4,3,0,0,0,0,0,3, ;25
- db 4,3,1,0,0,0,0,3, ;26
- db 4,3,1,0,0,0,0,4, ;27
- db 4,3,2,0,0,0,0,3, ;28
- db 4,3,2,0,0,0,0,4, ;29
- db 4,3,2,1,0,0,0,4, ;30
- db 4,3,2,1,0,0,0,5, ;31
- db 5,0,0,0,0,0,0,1, ;32
- db 5,0,0,0,0,0,0,2, ;33
- db 5,1,0,0,0,0,0,2, ;34
- db 5,1,0,0,0,0,0,3, ;35
- db 5,2,0,0,0,0,0,2, ;36
- db 5,2,0,0,0,0,0,3, ;37
- db 5,2,1,0,0,0,0,3, ;38
- db 5,2,1,0,0,0,0,4, ;39
- db 5,3,0,0,0,0,0,2, ;40
- db 5,3,0,0,0,0,0,3, ;41
- db 5,3,1,0,0,0,0,3, ;42
- db 5,3,1,0,0,0,0,4, ;43
- db 5,3,2,0,0,0,0,3, ;44
- db 5,3,2,0,0,0,0,4, ;45
- db 5,3,2,1,0,0,0,4, ;46
- db 5,3,2,1,0,0,0,5, ;47
- db 5,4,0,0,0,0,0,2, ;48
- db 5,4,0,0,0,0,0,3, ;49
- db 5,4,1,0,0,0,0,3, ;50
- db 5,4,1,0,0,0,0,4, ;51
- db 5,4,2,0,0,0,0,3, ;52
- db 5,4,2,0,0,0,0,4, ;53
- db 5,4,2,1,0,0,0,4, ;54
- db 5,4,2,1,0,0,0,5, ;55
- db 5,4,3,0,0,0,0,3, ;56
- db 5,4,3,0,0,0,0,4, ;57
- db 5,4,3,1,0,0,0,4, ;58
- db 5,4,3,1,0,0,0,5, ;59
- db 5,4,3,2,0,0,0,4, ;60
- db 5,4,3,2,0,0,0,5, ;61
- db 5,4,3,2,1,0,0,5, ;62
- db 5,4,3,2,1,0,0,6, ;63
- db 6,0,0,0,0,0,0,1, ;64
- db 6,0,0,0,0,0,0,2, ;65
- db 6,1,0,0,0,0,0,2, ;66
- db 6,1,0,0,0,0,0,3, ;67
- db 6,2,0,0,0,0,0,2, ;68
- db 6,2,0,0,0,0,0,3, ;69
- db 6,2,1,0,0,0,0,3, ;70
- db 6,2,1,0,0,0,0,4, ;71
- db 6,3,0,0,0,0,0,2, ;72
- db 6,3,0,0,0,0,0,3, ;73
- db 6,3,1,0,0,0,0,3, ;74
- db 6,3,1,0,0,0,0,4, ;75
- db 6,3,2,0,0,0,0,3, ;76
- db 6,3,2,0,0,0,0,4, ;77
- db 6,3,2,1,0,0,0,4, ;78
- db 6,3,2,1,0,0,0,5, ;79
- db 6,4,0,0,0,0,0,2, ;80
- db 6,4,0,0,0,0,0,3, ;81
- db 6,4,1,0,0,0,0,3, ;82
- db 6,4,1,0,0,0,0,4, ;83
- db 6,4,2,0,0,0,0,3, ;84
- db 6,4,2,0,0,0,0,4, ;85
- db 6,4,2,1,0,0,0,4, ;86
- db 6,4,2,1,0,0,0,5, ;87
- db 6,4,3,0,0,0,0,3, ;88
- db 6,4,3,0,0,0,0,4, ;89
- db 6,4,3,1,0,0,0,4, ;90
- db 6,4,3,1,0,0,0,5, ;91
- db 6,4,3,2,0,0,0,4, ;92
- db 6,4,3,2,0,0,0,5, ;93
- db 6,4,3,2,1,0,0,5, ;94
- db 6,4,3,2,1,0,0,6, ;95
- db 6,5,0,0,0,0,0,2, ;96
- db 6,5,0,0,0,0,0,3, ;97
- db 6,5,1,0,0,0,0,3, ;98
- db 6,5,1,0,0,0,0,4, ;99
- db 6,5,2,0,0,0,0,3, ;100
- db 6,5,2,0,0,0,0,4, ;101
- db 6,5,2,1,0,0,0,4, ;102
- db 6,5,2,1,0,0,0,5, ;103
- db 6,5,3,0,0,0,0,3, ;104
- db 6,5,3,0,0,0,0,4, ;105
- db 6,5,3,1,0,0,0,4, ;106
- db 6,5,3,1,0,0,0,5, ;107
- db 6,5,3,2,0,0,0,4, ;108
- db 6,5,3,2,0,0,0,5, ;109
- db 6,5,3,2,1,0,0,5, ;110
- db 6,5,3,2,1,0,0,6, ;111
- db 6,5,4,0,0,0,0,3, ;112
- db 6,5,4,0,0,0,0,4, ;113
- db 6,5,4,1,0,0,0,4, ;114
- db 6,5,4,1,0,0,0,5, ;115
- db 6,5,4,2,0,0,0,4, ;116
- db 6,5,4,2,0,0,0,5, ;117
- db 6,5,4,2,1,0,0,5, ;118
- db 6,5,4,2,1,0,0,6, ;119
- db 6,5,4,3,0,0,0,4, ;120
- db 6,5,4,3,0,0,0,5, ;121
- db 6,5,4,3,1,0,0,5, ;122
- db 6,5,4,3,1,0,0,6, ;123
- db 6,5,4,3,2,0,0,5, ;124
- db 6,5,4,3,2,0,0,6, ;125
- db 6,5,4,3,2,1,0,6, ;126
- db 6,5,4,3,2,1,0,7, ;127
- db 7,0,0,0,0,0,0,1, ;128
- db 7,0,0,0,0,0,0,2, ;129
- db 7,1,0,0,0,0,0,2, ;130
- db 7,1,0,0,0,0,0,3, ;131
- db 7,2,0,0,0,0,0,2, ;132
- db 7,2,0,0,0,0,0,3, ;133
- db 7,2,1,0,0,0,0,3, ;134
- db 7,2,1,0,0,0,0,4, ;135
- db 7,3,0,0,0,0,0,2, ;136
- db 7,3,0,0,0,0,0,3, ;137
- db 7,3,1,0,0,0,0,3, ;138
- db 7,3,1,0,0,0,0,4, ;139
- db 7,3,2,0,0,0,0,3, ;140
- db 7,3,2,0,0,0,0,4, ;141
- db 7,3,2,1,0,0,0,4, ;142
- db 7,3,2,1,0,0,0,5, ;143
- db 7,4,0,0,0,0,0,2, ;144
- db 7,4,0,0,0,0,0,3, ;145
- db 7,4,1,0,0,0,0,3, ;146
- db 7,4,1,0,0,0,0,4, ;147
- db 7,4,2,0,0,0,0,3, ;148
- db 7,4,2,0,0,0,0,4, ;149
- db 7,4,2,1,0,0,0,4, ;150
- db 7,4,2,1,0,0,0,5, ;151
- db 7,4,3,0,0,0,0,3, ;152
- db 7,4,3,0,0,0,0,4, ;153
- db 7,4,3,1,0,0,0,4, ;154
- db 7,4,3,1,0,0,0,5, ;155
- db 7,4,3,2,0,0,0,4, ;156
- db 7,4,3,2,0,0,0,5, ;157
- db 7,4,3,2,1,0,0,5, ;158
- db 7,4,3,2,1,0,0,6, ;159
- db 7,5,0,0,0,0,0,2, ;160
- db 7,5,0,0,0,0,0,3, ;161
- db 7,5,1,0,0,0,0,3, ;162
- db 7,5,1,0,0,0,0,4, ;163
- db 7,5,2,0,0,0,0,3, ;164
- db 7,5,2,0,0,0,0,4, ;165
- db 7,5,2,1,0,0,0,4, ;166
- db 7,5,2,1,0,0,0,5, ;167
- db 7,5,3,0,0,0,0,3, ;168
- db 7,5,3,0,0,0,0,4, ;169
- db 7,5,3,1,0,0,0,4, ;170
- db 7,5,3,1,0,0,0,5, ;171
- db 7,5,3,2,0,0,0,4, ;172
- db 7,5,3,2,0,0,0,5, ;173
- db 7,5,3,2,1,0,0,5, ;174
- db 7,5,3,2,1,0,0,6, ;175
- db 7,5,4,0,0,0,0,3, ;176
- db 7,5,4,0,0,0,0,4, ;177
- db 7,5,4,1,0,0,0,4, ;178
- db 7,5,4,1,0,0,0,5, ;179
- db 7,5,4,2,0,0,0,4, ;180
- db 7,5,4,2,0,0,0,5, ;181
- db 7,5,4,2,1,0,0,5, ;182
- db 7,5,4,2,1,0,0,6, ;183
- db 7,5,4,3,0,0,0,4, ;184
- db 7,5,4,3,0,0,0,5, ;185
- db 7,5,4,3,1,0,0,5, ;186
- db 7,5,4,3,1,0,0,6, ;187
- db 7,5,4,3,2,0,0,5, ;188
- db 7,5,4,3,2,0,0,6, ;189
- db 7,5,4,3,2,1,0,6, ;190
- db 7,5,4,3,2,1,0,7, ;191
- db 7,6,0,0,0,0,0,2, ;192
- db 7,6,0,0,0,0,0,3, ;193
- db 7,6,1,0,0,0,0,3, ;194
- db 7,6,1,0,0,0,0,4, ;195
- db 7,6,2,0,0,0,0,3, ;196
- db 7,6,2,0,0,0,0,4, ;197
- db 7,6,2,1,0,0,0,4, ;198
- db 7,6,2,1,0,0,0,5, ;199
- db 7,6,3,0,0,0,0,3, ;200
- db 7,6,3,0,0,0,0,4, ;201
- db 7,6,3,1,0,0,0,4, ;202
- db 7,6,3,1,0,0,0,5, ;203
- db 7,6,3,2,0,0,0,4, ;204
- db 7,6,3,2,0,0,0,5, ;205
- db 7,6,3,2,1,0,0,5, ;206
- db 7,6,3,2,1,0,0,6, ;207
- db 7,6,4,0,0,0,0,3, ;208
- db 7,6,4,0,0,0,0,4, ;209
- db 7,6,4,1,0,0,0,4, ;210
- db 7,6,4,1,0,0,0,5, ;211
- db 7,6,4,2,0,0,0,4, ;212
- db 7,6,4,2,0,0,0,5, ;213
- db 7,6,4,2,1,0,0,5, ;214
- db 7,6,4,2,1,0,0,6, ;215
- db 7,6,4,3,0,0,0,4, ;216
- db 7,6,4,3,0,0,0,5, ;217
- db 7,6,4,3,1,0,0,5, ;218
- db 7,6,4,3,1,0,0,6, ;219
- db 7,6,4,3,2,0,0,5, ;220
- db 7,6,4,3,2,0,0,6, ;221
- db 7,6,4,3,2,1,0,6, ;222
- db 7,6,4,3,2,1,0,7, ;223
- db 7,6,5,0,0,0,0,3, ;224
- db 7,6,5,0,0,0,0,4, ;225
- db 7,6,5,1,0,0,0,4, ;226
- db 7,6,5,1,0,0,0,5, ;227
- db 7,6,5,2,0,0,0,4, ;228
- db 7,6,5,2,0,0,0,5, ;229
- db 7,6,5,2,1,0,0,5, ;230
- db 7,6,5,2,1,0,0,6, ;231
- db 7,6,5,3,0,0,0,4, ;232
- db 7,6,5,3,0,0,0,5, ;233
- db 7,6,5,3,1,0,0,5, ;234
- db 7,6,5,3,1,0,0,6, ;235
- db 7,6,5,3,2,0,0,5, ;236
- db 7,6,5,3,2,0,0,6, ;237
- db 7,6,5,3,2,1,0,6, ;238
- db 7,6,5,3,2,1,0,7, ;239
- db 7,6,5,4,0,0,0,4, ;240
- db 7,6,5,4,0,0,0,5, ;241
- db 7,6,5,4,1,0,0,5, ;242
- db 7,6,5,4,1,0,0,6, ;243
- db 7,6,5,4,2,0,0,5, ;244
- db 7,6,5,4,2,0,0,6, ;245
- db 7,6,5,4,2,1,0,6, ;246
- db 7,6,5,4,2,1,0,7, ;247
- db 7,6,5,4,3,0,0,5, ;248
- db 7,6,5,4,3,0,0,6, ;249
- db 7,6,5,4,3,1,0,6, ;250
- db 7,6,5,4,3,1,0,7, ;251
- db 7,6,5,4,3,2,0,6, ;252
- db 7,6,5,4,3,2,0,7, ;253
- db 7,6,5,4,3,2,1,7, ;254
- db 7,6,5,4,3,2,1,8, ;255
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-
-
-;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
-;***********************************************************************
-WELS_EXTERN CavlcParamCal_sse2
- push ebx
- push edi
- push esi
-
- mov eax, [esp+16] ;coffLevel
- mov edi, [esp+24] ;Level
- mov ebx, [esp+32] ;endIdx
- cmp ebx, 3
- jne .Level16
- pxor xmm1, xmm1
- movq xmm0, [eax] ; removed QWORD
- jmp .Cal_begin
-.Level16:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
-.Cal_begin:
- movdqa xmm2, xmm0
- packsswb xmm0, xmm1
- movdqa xmm4, xmm0
- pxor xmm3, xmm3
- pcmpgtb xmm0, xmm3
- pcmpgtb xmm3, xmm4
- por xmm0, xmm3
- pmovmskb edx, xmm0
- cmp edx, 0
- je near .return
- movdqa xmm6, [sse2_b_1]
- pcmpeqw xmm7, xmm7 ;generate -1
- mov ebx, 0xff
- ;pinsrw xmm6, ebx, 3
-
- mov bl, dh
-
- lea ebx, [byte_1pos_table+8*ebx]
- movq xmm0, [ebx]
- pextrw ecx, xmm0, 3
- shr ecx, 8
- mov dh, cl
-
-.loopHighFind0:
- cmp ecx, 0
- je .loopHighFind0End
- ;mov esi, [ebx]
- ;and esi, 0xff
- movzx esi, byte [ebx]
- add esi, 8
- mov esi, [eax+2*esi]
- mov [edi], si
- add edi, 2
- ;add ebx, 1
- inc ebx
- dec ecx
- jmp .loopHighFind0
-.loopHighFind0End:
- mov cl, dh
- cmp cl, 8
- pand xmm0, xmm6
- jne .LowByteFind0
- sub edi, 2
- mov esi, [eax+16]
- mov [edi], esi
- add edi, 2
-.LowByteFind0:
- and edx, 0xff
- lea ebx, [byte_1pos_table+8*edx]
- movq xmm1, [ebx]
- pextrw esi, xmm1, 3
- or esi, 0xff
- or ecx, 0xff00
- and ecx, esi
- shr esi, 8
- pand xmm1, xmm6
-.loopLowFind0:
- cmp esi, 0
- je .loopLowFind0End
- ;mov edx, [ebx]
- ;and edx, 0xff
- movzx edx, byte [ebx]
- mov edx, [eax+2*edx]
- mov [edi], dx
- add edi, 2
- ;add ebx, 1
- inc ebx
- dec esi
- jmp .loopLowFind0
-.loopLowFind0End:
- cmp ch, 8
- jne .getLevelEnd
- sub edi, 2
- mov edx, [eax]
- mov [edi], dx
-.getLevelEnd:
- mov edx, [esp+28] ;total_coeffs
- ;mov ebx, ecx
- ;and ebx, 0xff
- movzx ebx, byte cl
- add cl, ch
- mov [edx], cl
-;getRun
- movq xmm5, [sse2_b8]
- paddb xmm0, xmm5
- pxor xmm2, xmm2
- pxor xmm3, xmm3
- mov eax, 8
- sub eax, ebx
- shl eax, 3
- shl ebx, 3
- pinsrw xmm2, ebx, 0
- pinsrw xmm3, eax, 0
- psllq xmm0, xmm3
- psrlq xmm0, xmm3
- movdqa xmm4, xmm1
- psllq xmm1, xmm2
- psrlq xmm4, xmm3
- punpcklqdq xmm1, xmm4
- por xmm0, xmm1
-
- pextrw eax, xmm0, 0
- and eax, 0xff
- inc eax
- sub al, cl
- movdqa xmm1, xmm0
- paddb xmm1, xmm7
- psrldq xmm0, 1
- psubb xmm1, xmm0
- mov ecx, [esp+20] ;run
- movdqa [ecx], xmm1
-;getRunEnd
-.return:
- pop esi
- pop edi
- pop ebx
- ret
-%endif
--- a/codec/encoder/core/asm/dct.asm
+++ /dev/null
@@ -1,504 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* ?Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* ?Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* dct.asm
-;*
-;* Abstract
-;* WelsDctFourT4_sse2
-;*
-;* History
-;* 8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-
-align 16
-SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 18, 23, 18, 23, 23, 29, 23, 29,
- dw 18, 23, 18, 23, 23, 29, 23, 29
-
-
-;***********************************************************************
-; MMX functions
-;***********************************************************************
-
-%macro MMX_LoadDiff4P 5
- movd %1, [%3]
- movd %2, [%4]
- punpcklbw %1, %5
- punpcklbw %2, %5
- psubw %1, %2
-%endmacro
-
-%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
- MMX_LoadDiff4P %1, %9, %5, %7, %10
- MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
- lea %5, [%5+2*%6]
- lea %7, [%7+2*%8]
- MMX_LoadDiff4P %3, %9, %5, %7, %10
- MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
-%endmacro
-
-%macro MMX_SumSubMul2 3
- movq %3, %1
- psllw %1, $01
- paddw %1, %2
- psllw %2, $01
- psubw %3, %2
-%endmacro
-
-%macro MMX_SumSubDiv2 3
- movq %3, %2
- psraw %3, $01
- paddw %3, %1
- psraw %1, $01
- psubw %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
- movq %3, %2
- psubw %2, %1
- paddw %1, %3
-%endmacro
-
-%macro MMX_DCT 6
- MMX_SumSub %4, %1, %6
- MMX_SumSub %3, %2, %6
- MMX_SumSub %3, %4, %6
- MMX_SumSubMul2 %1, %2, %5
-%endmacro
-
-%macro MMX_IDCT 6
- MMX_SumSub %4, %5, %6
- MMX_SumSubDiv2 %3, %2, %1
- MMX_SumSub %1, %4, %6
- MMX_SumSub %3, %5, %6
-%endmacro
-
-%macro MMX_StoreDiff4P 6
- movd %2, %6
- punpcklbw %2, %4
- paddw %1, %3
- psraw %1, $06
- paddsw %1, %2
- packuswb %1, %2
- movd %5, %1
-%endmacro
-SECTION .text
-;***********************************************************************
-; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctT4_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
- WELS_Zero mm7
-
- MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
-
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
- MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
-
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
- MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
-
- movq [r0+ 0], mm2
- movq [r0+ 8], mm1
- movq [r0+16], mm5
- movq [r0+24], mm4
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
-;***********************************************************************
-; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
-;***********************************************************************
-WELS_EXTERN WelsIDctT4Rec_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- movq mm0, [r4+ 0]
- movq mm1, [r4+ 8]
- movq mm2, [r4+16]
- movq mm3, [r4+24]
-
- MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
- MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
- MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
- MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6
-
- WELS_Zero mm7
- WELS_DW32 mm6
-
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
-;***********************************************************************
-; SSE2 functions
-;***********************************************************************
-%macro SSE2_Store4x8p 6
- SSE2_XSawp qdq, %2, %3, %6
- SSE2_XSawp qdq, %4, %5, %3
- MOVDQ [%1+0x00], %2
- MOVDQ [%1+0x10], %4
- MOVDQ [%1+0x20], %6
- MOVDQ [%1+0x30], %3
-%endmacro
-
-%macro SSE2_Load4x8p 6
- MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
- SSE2_XSawp qdq, %4, %3, %5
- SSE2_XSawp qdq, %2, %6, %3
-%endmacro
-
-%macro SSE2_SumSubMul2 3
- movdqa %3, %1
- paddw %1, %1
- paddw %1, %2
- psubw %3, %2
- psubw %3, %2
-%endmacro
-
-%macro SSE2_SumSubDiv2 4
- movdqa %4, %1
- movdqa %3, %2
- psraw %2, $01
- psraw %4, $01
- paddw %1, %2
- psubw %4, %3
-%endmacro
-
-%macro SSE2_StoreDiff8p 6
- paddw %1, %3
- psraw %1, $06
- movq %2, %6
- punpcklbw %2, %4
- paddsw %2, %1
- packuswb %2, %2
- movq %5, %2
-%endmacro
-
-%macro SSE2_StoreDiff8p 5
- movq %2, %5
- punpcklbw %2, %3
- paddsw %2, %1
- packuswb %2, %2
- movq %4, %2
-%endmacro
-
-%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
- paddw %1, %5
- psraw %1, $06 ; (dc + 32) >> 6
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
-
- movdqa %3, %1
- psrldq %3, 8
- punpcklwd %3, %3
- punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-
- movdqa %4, %1
- psrldq %4, 12
- punpcklwd %4, %4
- punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-
- punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
-%endmacro
-
-%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
-%endmacro
-
-%macro SSE2_IDCT 7
- SSE2_SumSub %7, %2, %6
- SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
- SSE2_SumSub %7, %4, %5
-%endmacro
-
-;***********************************************************************
-; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctFourT4_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- SIGN_EXTENSION r4, r4d
- pxor xmm7, xmm7
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
-
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
-
- ;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
- lea r1, [r1 + 2 * r2]
- lea r3, [r3 + 2 * r4]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
- SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
- SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
- lea r0, [r0+64]
- SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
-
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-;***********************************************************************
-; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
-;***********************************************************************
-WELS_EXTERN WelsIDctFourT4Rec_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- ;Load 4x8
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
-
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
- SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
- WELS_Zero xmm7
- WELS_DW32 xmm6
-
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
-
- add r4, 64
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
-
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
- SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
- SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
- WELS_Zero xmm7
- WELS_DW32 xmm6
-
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
- POP_XMM
- LOAD_5_PARA_POP
- ; pop esi
- ; pop ebx
- ret
-
-%macro SSE2_StoreDiff4x8p 8
- SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
-%endmacro
-
- ;***********************************************************************
-; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
-;***********************************************************************
-WELS_EXTERN WelsIDctRecI16x16Dc_sse2
- %assign push_num 0
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- pxor xmm7, xmm7
- WELS_DW32 xmm6
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- POP_XMM
- LOAD_5_PARA_POP
- ret
-
-
-
-%macro SSE2_SumSubD 3
- movdqa %3, %2
- paddd %2, %1
- psubd %1, %3
-%endmacro
-
-%macro SSE2_SumSubDiv2D 4
- paddd %1, %2
- paddd %1, %3
- psrad %1, 1
- movdqa %4, %1
- psubd %4, %2
-%endmacro
-%macro SSE2_Load4Col 5
- movsx r2, WORD[%5]
- movd %1, r2d
- movsx r2, WORD[%5 + 0x20]
- movd %2, r2d
- punpckldq %1, %2
- movsx r2, WORD[%5 + 0x80]
- movd %3, r2d
- movsx r2, WORD[%5 + 0xa0]
- movd %4, r2d
- punpckldq %3, %4
- punpcklqdq %1, %3
-%endmacro
-
-;***********************************************************************
-;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
-;***********************************************************************
-WELS_EXTERN WelsHadamardT4Dc_sse2
- %assign push_num 0
- LOAD_2_PARA
- PUSH_XMM 8
- SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
- SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
- SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
- SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
-
- SSE2_SumSubD xmm1, xmm2, xmm7
- SSE2_SumSubD xmm3, xmm4, xmm7
- SSE2_SumSubD xmm2, xmm4, xmm7
- SSE2_SumSubD xmm1, xmm3, xmm7
-
- SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
-
- SSE2_SumSubD xmm4, xmm3, xmm7
- SSE2_SumSubD xmm5, xmm1, xmm7
-
- WELS_DD1 xmm6
- SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
- SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
- SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
-
- packssdw xmm3, xmm4
- packssdw xmm2, xmm1
- movdqa [r0+ 0], xmm3
- movdqa [r0+16], xmm2
-
- POP_XMM
- ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ /dev/null
@@ -1,1416 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
-
-; for chroma plane mode
-sse2_plane_inc_c dw 1, 2, 3, 4
-sse2_plane_dec_c dw 4, 3, 2, 1
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
-
-align 16
-mmx_01bytes: times 16 db 1
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
-
-
-;***********************************************************************
-; macros
-;***********************************************************************
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-;%1 will keep the last result
-%macro SSE_DB_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubb %1, %2
-%endmacro
-
-;xmm0, xmm1, xmm2, eax, ecx
-;lower 64 bits of xmm0 save the result
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1]
- movdqa %3, %1
- punpcklbw %1, %3
- movdqa %3, %1
- punpcklbw %1, %3
-
- ;add %4, %5
- movd %2, [%4+%5-1]
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2
-%endmacro
-
-%macro SUMW_HORIZON1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro LOAD_COLUMN 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpcklwd %1, %3
- lea %5, [%5+2*%6]
- movd %4, [%5]
- movd %2, [%5+%6]
- punpcklbw %4, %2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- lea %5, [%5+2*%6]
- punpcklbw %3, %2
- punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
-
-%macro SUMW_HORIZON 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro LOAD_COLUMN_C 6
- movd %1, [%5]
- movd %2, [%5+%6]
- punpcklbw %1,%2
- lea %5, [%5+2*%6]
- movd %3, [%5]
- movd %2, [%5+%6]
- punpcklbw %3, %2
- punpckhwd %1, %3
- lea %5, [%5+2*%6]
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01]
- add r3, r4
- movzx r4, byte [r1+r2-0x01]
- add r3, r4
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
-; pred must align to 16
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r3, byte [r1-1]
- movd xmm0, r3d
- pmuludq xmm0, [mmx_01bytes]
-
- movzx r3, byte [r1+r2-1]
- movd xmm1, r3d
- pmuludq xmm1, [mmx_01bytes]
-
- unpcklps xmm0, xmm1
-
- lea r1, [r1+r2*2]
- movzx r3, byte [r1-1]
- movd xmm2, r3d
- pmuludq xmm2, [mmx_01bytes]
-
- movzx r3, byte [r1+r2-1]
- movd xmm3, r3d
- pmuludq xmm3, [mmx_01bytes]
-
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
-
- movdqa [r0], xmm0
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
-
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r1]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7
- pmullw xmm0, xmm5
- movq xmm1, [r1 + 9]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6
- psubw xmm1, xmm0
-
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r4, BYTE [r1+16]
- sub r1, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
-
- add r1, 3
- movzx r3, BYTE [r1+8*r2]
- add r4, r3
- shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
-
- sub r1, 3
- add r1, r2
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4
- pmullw xmm0, xmm5
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6
- psubw xmm7, xmm0
-
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
-
- add r4, 16
- imul r3, -7
- add r3, r4 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3
- movdqa xmm5, [sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3
- movdqa [r0], xmm2
- paddw xmm0, xmm4
- add r0, 16
- inc r3
- cmp r3, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
- add r0, 16
- add r1, r2
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- dec r1
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movdqa xmm0, [r1]
-
- movdqa [r0], xmm0
- movdqa [r0+10h], xmm0
- movdqa [r0+20h], xmm0
- movdqa [r0+30h], xmm0
- movdqa [r0+40h], xmm0
- movdqa [r0+50h], xmm0
- movdqa [r0+60h], xmm0
- movdqa [r0+70h], xmm0
- movdqa [r0+80h], xmm0
- movdqa [r0+90h], xmm0
- movdqa [r0+160], xmm0
- movdqa [r0+176], xmm0
- movdqa [r0+192], xmm0
- movdqa [r0+208], xmm0
- movdqa [r0+224], xmm0
- movdqa [r0+240], xmm0
-
- ret
-
-;***********************************************************************
-; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredPlane_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2, r2d
- sub r1, 1
- sub r1, r2
-
- pxor mm7, mm7
- movq mm0, [r1]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7
- pmullw mm0, mm5
- movq mm1, [r1 + 5]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7
- pmullw mm1, mm6
- psubw mm1, mm0
-
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r3d ; mm1 = b,b,b,b,b,b,b,b
-
- movzx r3, BYTE [r1+8]
- sub r1, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
-
- add r1, 3
- movzx r4, BYTE [r1+4*r2]
- add r4, r3
- shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
-
- sub r1, 3
- add r1, r2
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
- pxor mm4, mm4
- punpckhbw mm0, mm4
- pmullw mm0, mm5
- punpckhbw mm7, mm4
- pmullw mm7, mm6
- psubw mm7, mm0
-
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r3d ; mm4 = c,c,c,c,c,c,c,c
-
- add r4, 16
- imul r3, -3
- add r3, r4 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3
- movdqa xmm5, [sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r0], xmm2
- paddw xmm0, xmm4
- add r0, 8
- inc r3
- cmp r3, 8
- jnz get_i_chroma_pred_plane_sse2_1
- POP_XMM
- pop r4
- pop r3
- WELSEMMS
- ret
-
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pred[7] = ([6]+[0]*2+[1]+2)/4
-;
-; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
- sub r1, r2 ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[5]=[1]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea r1,[r1+r2*2-8h] ;set eax point to 12
- movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[r1+r2*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
- pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes
- pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
- movd [r0+12],mm2
- psrlq mm2,8
- movd [r0+8],mm2
- psrlq mm2,8
- movd [r0+4],mm2
- psrlq mm2,8
- movd [r0],mm2
- WELSEMMS
- ret
-
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 5 |6 |7 |8 |9 |
-; 10|11|12|13|14|
-; 15|16|17|18|19|
-; 20|21|22|23|24|
-; 6 is the start pixel of current 4x4 block
-; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
-;
-; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movzx r4, byte [r1-1h]
- sub r1, r2
- movd xmm0, [r1]
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- xor r3, r3
- movd r3d, xmm0
- add r3, r4
- movzx r4, byte [r1+r2*2-1h]
- add r3, r4
-
- lea r1, [r1+r2*2-1]
- movzx r4, byte [r1+r2]
- add r3, r4
-
- movzx r4, byte [r1+r2*2]
- add r3, r4
- add r3, 4
- sar r3, 3
- imul r3, 0x01010101
-
- movd xmm0, r3d
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy 8 pixel of 8 line from left
-;***********************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4
- movq %1, [%3-8]
- psrlq %1, 38h
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+r2-8]
- psrlq %1, 38h
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-WELS_EXTERN WelsIChromaPredH_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movq mm0, [r1-8]
- psrlq mm0, 38h
-
- ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
- WELSEMMS
- ret
-
-;***********************************************************************
-; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy pixels from top 4 pixels
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd xmm0, [r1]
- pshufd xmm0, xmm0, 0
- movdqa [r0], xmm0
- ret
-
-;***********************************************************************
-; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy 8 pixels from top 8 pixels
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq xmm0, [r1]
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm1
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- ret
-
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
-
-; a = (1 + lt + l0)>>1
-; e = (1 + l0 + l1)>>1
-; g = (1 + l1 + l2)>>1
-; i = (1 + l2 + l3)>>1
-
-; d = (2 + t0 + (t1<<1) + t2)>>2
-; c = (2 + lt + (t0<<1) + t1)>>2
-; b = (2 + l0 + (lt<<1) + t0)>>2
-
-; f = (2 + l1 + (l0<<1) + lt)>>2
-; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
-; [b a f e h g j i] + [d c b a] --> mov to memory
-;
-; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHD_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movd mm2, [r1+2*r2-4]
- punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0
-
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
- movd [r0], mm2
- movd [r0+12], mm3
- psrlq mm3, 10h
- movd [r0+8], mm3
- psrlq mm3, 10h
- movd [r0+4], mm3
- WELSEMMS
- ret
-
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 will never been used
-; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
-
-; a = (1 + l0 + l1)>>1
-; c = (1 + l1 + l2)>>1
-; e = (1 + l2 + l3)>>1
-; g = l3
-
-; b = (2 + l0 + (l1<<1) + l2)>>2
-; d = (2 + l1 + (l2<<1) + l3)>>2
-; f = (2 + l2 + (l3<<1) + l3)>>2
-
-; [g g f e d c b a] + [g g g g] --> mov to memory
-;
-; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHU_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- movd mm0, [r1-4] ; mm0[3] = l0
- punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
- lea r1, [r1+2*r2]
- movd mm2, [r1-4] ; mm2[3] = l2
- movd mm4, [r1+r2-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0
-
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
-
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
- psrlq mm4, 20h
- movd [r0+12], mm4
-
- movd [r0], mm1
- psrlq mm1, 10h
- movd [r0+4], mm1
- psrlq mm1, 10h
- movd [r0+8], mm1
- WELSEMMS
- ret
-
-
-
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
-
-; a = (1 + lt + t0)>>1
-; b = (1 + t0 + t1)>>1
-; c = (1 + t1 + t2)>>1
-; d = (1 + t2 + t3)>>1
-
-; e = (2 + l0 + (lt<<1) + t0)>>2
-; f = (2 + lt + (t0<<1) + t1)>>2
-; g = (2 + t0 + (t1<<1) + t2)>>2
-
-; h = (2 + t1 + (t2<<1) + t3)>>2
-; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
-; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVR_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movq mm2, [r1+r2-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0
-
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
-
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
-
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1
-
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+4], mm2
-
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+8], mm4
-
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [r0+12], mm5
- WELSEMMS
- ret
-
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
-; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
-
-; a = (2 + t0 + t2 + (t1<<1))>>2
-; b = (2 + t1 + t3 + (t2<<1))>>2
-; c = (2 + t2 + t4 + (t3<<1))>>2
-; d = (2 + t3 + t5 + (t4<<1))>>2
-
-; e = (2 + t4 + t6 + (t5<<1))>>2
-; f = (2 + t5 + t7 + (t6<<1))>>2
-; g = (2 + t6 + t7 + (t7<<1))>>2
-
-; [g f e d c b a] --> mov to memory
-;
-; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
-
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
- psrlq mm0, 8h
- movd [r0], mm0
- psrlq mm0, 8h
- movd [r0+4], mm0
- psrlq mm0, 8h
- movd [r0+8], mm0
- psrlq mm0, 8h
- movd [r0+12], mm0
- WELSEMMS
- ret
-
-
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt,t0,t1,t2,t3 will never been used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
-
-; a = (1 + t0 + t1)>>1
-; b = (1 + t1 + t2)>>1
-; c = (1 + t2 + t3)>>1
-; d = (1 + t3 + t4)>>1
-; i = (1 + t4 + t5)>>1
-
-; e = (2 + t0 + (t1<<1) + t2)>>2
-; f = (2 + t1 + (t2<<1) + t3)>>2
-; g = (2 + t2 + (t3<<1) + t4)>>2
-; h = (2 + t3 + (t4<<1) + t5)>>2
-; j = (2 + t4 + (t5<<1) + t6)>>2
-
-; [i d c b a] + [j h g f e] --> mov to memory
-;
-; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVL_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
- movq mm4, mm2
- pavgb mm2, mm0
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
- movd [r0], mm3
- psrlq mm3, 8h
- movd [r0+8], mm3
-
- movd [r0+4], mm2
- psrlq mm2, 8h
- movd [r0+12], mm2
- WELSEMMS
- ret
-
-;***********************************************************************
-;
-; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movq mm0, [r1]
-
- movzx r3, byte [r1+r2-0x01] ; l1
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l2
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l3
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l4
- add r3, r4
- movd mm1, r3d ; mm1 = l1+l2+l3+l4
-
- movzx r3, byte [r1+r2-0x01] ; l5
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l6
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l7
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l8
- add r3, r4
- movd mm2, r3d ; mm2 = l5+l6+l7+l8
-
- movq mm3, mm0
- psrlq mm0, 0x20
- psllq mm3, 0x20
- psrlq mm3, 0x20
- pxor mm4, mm4
- psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
- movq mm4, [mmx_0x02]
-
- paddq mm0, mm4
- psrlq mm0, 0x02
-
- paddq mm2, mm4
- psrlq mm2, 0x02
-
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03
-
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03
-
- pmuludq mm0, [mmx_01bytes]
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
-
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm2 = m_down
-
- movq [r0], mm0
- movq [r0+0x08], mm0
- movq [r0+0x10], mm0
- movq [r0+0x18], mm0
-
- movq [r0+0x20], mm1
- movq [r0+0x28], mm1
- movq [r0+0x30], mm1
- movq [r0+0x38], mm1
-
- pop r4
- pop r3
- WELSEMMS
- ret
-
-
-
-;***********************************************************************
-;
-; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredDc_sse2
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movdqa xmm0, [r1] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08
- paddw xmm0, xmm1
-
- movzx r3, byte [r1+r2-0x01]
- movzx r4, byte [r1+2*r2-0x01]
- add r3, r4
- lea r1, [r1+r2]
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r3, 0x10
- movd xmm1, r3d
- paddw xmm0, xmm1
- psrld xmm0, 0x05
- pmuludq xmm0, [mmx_01bytes]
- pshufd xmm0, xmm0, 0
-
- movdqa [r0], xmm0
- movdqa [r0+0x10], xmm0
- movdqa [r0+0x20], xmm0
- movdqa [r0+0x30], xmm0
- movdqa [r0+0x40], xmm0
- movdqa [r0+0x50], xmm0
- movdqa [r0+0x60], xmm0
- movdqa [r0+0x70], xmm0
- movdqa [r0+0x80], xmm0
- movdqa [r0+0x90], xmm0
- movdqa [r0+0xa0], xmm0
- movdqa [r0+0xb0], xmm0
- movdqa [r0+0xc0], xmm0
- movdqa [r0+0xd0], xmm0
- movdqa [r0+0xe0], xmm0
- movdqa [r0+0xf0], xmm0
-
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
-;
-;***********************************************************************
-%ifdef X86_32
-WELS_EXTERN WelsSampleSatdThree4x4_sse2
- push ebx
- push esi
- push edi
- mov eax, [esp+24];p_enc
- mov ebx, [esp+28];linesize_enc
-
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [eax]
- movd xmm1, [eax+ebx]
- lea eax , [eax+2*ebx]
- movd xmm2, [eax]
- movd xmm3, [eax+ebx]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- ; load top boundary samples: [a b c d]
- mov eax, [esp+16];p_dec
- sub eax, [esp+20];linesize_dec
- movzx ecx, byte [eax]
- movzx edx, byte [eax+1]
- movzx esi, byte [eax+2]
- movzx edi, byte [eax+3]
-
- ; get the transform results of top boundary samples: [a b c d]
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
- movdqa xmm6, xmm0
- movdqa xmm7, xmm2
- movd xmm5, edi ; store the edi for DC mode
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, edi, 0
- pinsrw xmm3, esi, 4
- psllw xmm3, 2
- pinsrw xmm4, edx, 0
- pinsrw xmm4, ecx, 4
- psllw xmm4, 2
-
- ; get the satd of H
- psubw xmm0, xmm3
- psubw xmm2, xmm4
-
- WELS_AbsW xmm0, xmm1
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
- ; load left boundary samples: [a b c d]'
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movzx edx, byte [eax+ebx-1]
- lea eax , [eax+2*ebx]
- movzx esi, byte [eax-1]
- movzx edi, byte [eax+ebx-1]
-
- ; get the transform results of left boundary samples: [a b c d]'
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
- movd xmm3, edi
- pinsrw xmm3, edx, 1
- pinsrw xmm3, ecx, 2
- pinsrw xmm3, esi, 3
- psllw xmm3, 2
-
- ; get the satd of V
- movdqa xmm2, xmm6
- movdqa xmm4, xmm7
- psubw xmm2, xmm3
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
-
- ; DC result is stored in xmm1
- add edi, 4
- movd xmm1, edi
- paddw xmm1, xmm5
- psrlw xmm1, 3
- movdqa xmm5, xmm1
- psllw xmm1, 4
-
- ; get the satd of DC
- psubw xmm6, xmm1
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
- ; comparing order: DC H V
- mov edx, [esp+32]
- movd eax, xmm6
- movd edi, xmm2
- movd esi, xmm0
- and eax, 0xffff
- shr eax, 1
- and edi, 0xffff
- shr edi, 1
- and esi, 0xffff
- shr esi, 1
- add eax, [esp+40]
- add edi, [esp+44]
- add esi, [esp+48]
- cmp ax, di
- jg near not_dc
- cmp ax, si
- jg near not_dc_h
-
- ; for DC mode
- movd ebx, xmm5
- imul ebx, 0x01010101
- movd xmm5, ebx
- pshufd xmm5, xmm5, 0
- movdqa [edx], xmm5
- mov ebx, [esp+36]
- mov dword [ebx], 0x02
- pop edi
- pop esi
- pop ebx
- ret
-
-not_dc:
- cmp di, si
- jg near not_dc_h
-
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movd xmm0, ecx
- pmuludq xmm0, xmm6
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm1, ecx
- pmuludq xmm1, xmm6
-%if 1
- punpckldq xmm0, xmm1
-%else
- unpcklps xmm0, xmm1
-%endif
- lea eax, [eax+ebx*2]
- movzx ecx, byte [eax-1]
- movd xmm2, ecx
- pmuludq xmm2, xmm6
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm3, ecx
- pmuludq xmm3, xmm6
-%if 1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
-%else
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
-%endif
- movdqa [edx],xmm0
-
- mov eax, edi
- mov ebx, [esp+36]
- mov dword [ebx], 0x01
-
- pop edi
- pop esi
- pop ebx
- ret
-not_dc_h:
- ; for V mode
- mov eax, [esp+16]
- sub eax, [esp+20]
- movd xmm0, [eax]
- pshufd xmm0, xmm0, 0
- movdqa [edx],xmm0
-
- mov eax, esi
- mov ebx, [esp+36]
- mov dword [ebx], 0x00
-
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-
--- a/codec/encoder/core/asm/memzero.asm
+++ /dev/null
@@ -1,132 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* memzero.asm
-;*
-;* Abstract
-;*
-;*
-;* History
-;* 9/16/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-;void WelsPrefetchZero_mmx(int8_t const*_A);
-;***********************************************************************
-WELS_EXTERN WelsPrefetchZero_mmx
- %assign push_num 0
- LOAD_1_PARA
- prefetchnta [r0]
- ret
-
-
-;***********************************************************************
-; void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroAligned64_sse2
-
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
-
- pxor xmm0, xmm0
-.memzeroa64_sse2_loops:
- movdqa [r0], xmm0
- movdqa [r0+16], xmm0
- movdqa [r0+32], xmm0
- movdqa [r0+48], xmm0
- add r0, 0x40
-
- add r1, 0x40
- jnz near .memzeroa64_sse2_loops
-
- ret
-
-;***********************************************************************
-; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize64_mmx
-
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
-
- pxor mm0, mm0
-.memzero64_mmx_loops:
- movq [r0], mm0
- movq [r0+8], mm0
- movq [r0+16], mm0
- movq [r0+24], mm0
- movq [r0+32], mm0
- movq [r0+40], mm0
- movq [r0+48], mm0
- movq [r0+56], mm0
- add r0, 0x40
-
- add r1, 0x40
- jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-;***********************************************************************
-; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize8_mmx
-
- %assign push_num 0
- LOAD_2_PARA
- SIGN_EXTENSION r1, r1d
- neg r1
- pxor mm0, mm0
-
-.memzero8_mmx_loops:
- movq [r0], mm0
- add r0, 0x08
-
- add r1, 0x08
- jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
-
--- a/codec/encoder/core/asm/quant.asm
+++ /dev/null
@@ -1,370 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* quant.asm
-;*
-;* Abstract
-;* sse2 quantize inter-block
-;*
-;* History
-;* 7/6/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-SECTION .text
-;************************************************
-;NEW_QUANT
-;************************************************
-
-%macro SSE2_Quant8 5
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
-%endmacro
-
-%macro SSE2_QuantMax8 6
- MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pmaxsw %6, %1
- pxor %1, %2
- psubw %1, %2
- MOVDQ %5, %1
-%endmacro
-
-%define pDct esp + 4
-%define ff esp + 8
-%define mf esp + 12
-%define max esp + 16
-;***********************************************************************
-; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
-;***********************************************************************
-WELS_EXTERN WelsQuant4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- movdqa xmm2, [r1]
- movdqa xmm3, [r2]
-
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-
- ret
-
-;***********************************************************************
-;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsQuant4x4Dc_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- SSE2_Copy8Times xmm3, r2d
-
- SSE2_Copy8Times xmm2, r1d
-
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-
- ret
-
-;***********************************************************************
-; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
-;***********************************************************************
-WELS_EXTERN WelsQuantFour4x4_sse2
- %assign push_num 0
- LOAD_3_PARA
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
-
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
-
- ret
-
-;***********************************************************************
-; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
-;***********************************************************************
-WELS_EXTERN WelsQuantFour4x4Max_sse2
- %assign push_num 0
- LOAD_4_PARA
- PUSH_XMM 8
- MOVDQ xmm2, [r1]
- MOVDQ xmm3, [r2]
-
- pxor xmm4, xmm4
- pxor xmm5, xmm5
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
-
- SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
- pmaxsw xmm0, xmm4
- pmaxsw xmm0, xmm5
- pmaxsw xmm0, xmm7
- movdqa xmm1, xmm0
- punpckhqdq xmm0, xmm1
- pmaxsw xmm0, xmm1
-
- movq [r3], xmm0
- POP_XMM
- LOAD_4_PARA_POP
- ret
-
-%macro MMX_Copy4Times 2
- movd %1, %2
- punpcklwd %1, %1
- punpckldq %1, %1
-%endmacro
-
-SECTION .text
-
-%macro MMX_Quant4 4
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
- paddusw %1, %3
- pmulhuw %1, %4
- pxor %1, %2
- psubw %1, %2
-%endmacro
-
-;***********************************************************************
-;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
-;***********************************************************************
-WELS_EXTERN WelsHadamardQuant2x2_mmx
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
-
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
-
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
-
- ; store dct_2x2
- movq [r3], mm1
- movq [r4], mm1
-
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- mov r1w, 0
- mov [r0], r1w
- mov [r0 + 0x20], r1w
- mov [r0 + 0x40], r1w
- mov [r0 + 0x60], r1w
-
-
- movd retrd, mm1
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSIONW r1, r1w
- SIGN_EXTENSIONW r2, r2w
- movd mm0, [r0]
- movd mm1, [r0 + 0x20]
- punpcklwd mm0, mm1
- movd mm3, [r0 + 0x40]
- movd mm1, [r0 + 0x60]
- punpcklwd mm3, mm1
-
- ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
- movq mm5, mm3
- paddw mm3, mm0
- psubw mm0, mm5
- punpcklwd mm3, mm0
- movq mm1, mm3
- psrlq mm1, 32
- movq mm5, mm1
- paddw mm1, mm3
- psubw mm3, mm5
- punpcklwd mm1, mm3
-
- ;quant_2x2_dc
- MMX_Copy4Times mm3, r2d
- MMX_Copy4Times mm2, r1d
- MMX_Quant4 mm1, mm0, mm2, mm3
-
- ; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
- pxor mm3, mm3
- packsswb mm1, mm3
- pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
- psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
- psadbw mm1, mm3 ;
- movd retrd, mm1
-
- WELSEMMS
- ret
-
-
-%macro SSE2_DeQuant8 3
- MOVDQ %2, %1
- pmullw %2, %3
- MOVDQ %1, %2
-%endmacro
-
-
-;***********************************************************************
-; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
-;***********************************************************************
-WELS_EXTERN WelsDequant4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
-
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
-
- ret
-
-;***********************************************************************====
-;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
-;***********************************************************************====
-
-WELS_EXTERN WelsDequantFour4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
-
- movdqa xmm1, [r1]
- SSE2_DeQuant8 [r0 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
- SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
-
- ret
-
-;***********************************************************************
-;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsDequantIHadamard4x4_sse2
- %assign push_num 0
- LOAD_2_PARA
- %ifndef X86_32
- movzx r1, r1w
- %endif
-
- ; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, r1d
- ;psrlw xmm1, 2 ; for the (>>2) in ihdm
- MOVDQ xmm0, [r0]
- MOVDQ xmm2, [r0+0x10]
- pmullw xmm0, xmm1
- pmullw xmm2, xmm1
-
- ; ihdm_4x4
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- movdqa xmm3, xmm2
- psrldq xmm3, 8
-
- SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
- SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
- SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
- SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
-
- SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
- SSE2_SumSub xmm2, xmm4, xmm5
- SSE2_SumSub xmm1, xmm0, xmm5
- SSE2_SumSub xmm4, xmm0, xmm5
- SSE2_SumSub xmm2, xmm1, xmm5
- SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
-
- punpcklqdq xmm0, xmm1
- MOVDQ [r0], xmm0
-
- punpcklqdq xmm2, xmm3
- MOVDQ [r0+16], xmm2
- ret
--- a/codec/encoder/core/asm/score.asm
+++ /dev/null
@@ -1,339 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* score.asm
-;*
-;* Abstract
-;* scan/score/count of sse2
-;*
-;* History
-;* 8/21/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-SECTION .rodata align=16
-
-;align 16
-;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
-align 16
-sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
-align 16
-sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-align 16
-sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
-align 16
-pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
-align 16
-pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
-align 16
-pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
-align 16
-pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
-
-align 16
-nozero_count_table:
-db 0,1,1,2,1,2,2,3,1,2
-db 2,3,2,3,3,4,1,2,2,3
-db 2,3,3,4,2,3,3,4,3,4
-db 4,5,1,2,2,3,2,3,3,4
-db 2,3,3,4,3,4,4,5,2,3
-db 3,4,3,4,4,5,3,4,4,5
-db 4,5,5,6,1,2,2,3,2,3
-db 3,4,2,3,3,4,3,4,4,5
-db 2,3,3,4,3,4,4,5,3,4
-db 4,5,4,5,5,6,2,3,3,4
-db 3,4,4,5,3,4,4,5,4,5
-db 5,6,3,4,4,5,4,5,5,6
-db 4,5,5,6,5,6,6,7,1,2
-db 2,3,2,3,3,4,2,3,3,4
-db 3,4,4,5,2,3,3,4,3,4
-db 4,5,3,4,4,5,4,5,5,6
-db 2,3,3,4,3,4,4,5,3,4
-db 4,5,4,5,5,6,3,4,4,5
-db 4,5,5,6,4,5,5,6,5,6
-db 6,7,2,3,3,4,3,4,4,5
-db 3,4,4,5,4,5,5,6,3,4
-db 4,5,4,5,5,6,4,5,5,6
-db 5,6,6,7,3,4,4,5,4,5
-db 5,6,4,5,5,6,5,6,6,7
-db 4,5,5,6,5,6,6,7,5,6
-db 6,7,6,7,7,8
-
-align 16
-high_mask_table:
- db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
- db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
- db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
- db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
- db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
- db 5, 8, 5, 7, 8,11, 6, 8, 8,11
- db 9,11,12,15, 0, 1, 1, 4, 1, 3
- db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
- db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
- db 7,10, 8,10,11,14, 3, 4, 4, 7
- db 5, 7, 8,11, 5, 7, 7,10, 8,10
- db 11,14, 6, 7, 8,11, 8,10,11,14
- db 9,11,11,14,12,14,15,18, 0, 0
- db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
- db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
- db 7,10, 5, 7, 7,10, 8,10,11,14
- db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
- db 6, 9, 7, 9,10,13, 5, 6, 7,10
- db 7, 9,10,13, 8,10,10,13,11,13
- db 14,17, 3, 4, 4, 7, 4, 6, 7,10
- db 5, 7, 7,10, 8,10,11,14, 5, 6
- db 7,10, 7, 9,10,13, 8,10,10,13
- db 11,13,14,17, 6, 7, 7,10, 8,10
- db 11,14, 8,10,10,13,11,13,14,17
- db 9,10,11,14,11,13,14,17,12,14
- db 14,17,15,17,18,21
-
-align 16
-low_mask_table:
- db 0, 3, 2, 6, 2, 5, 5, 9, 1, 5
- db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
- db 4, 7, 7,11, 4, 8, 7,11, 8,11
- db 11,15, 1, 4, 3, 7, 4, 7, 7,11
- db 3, 7, 6,10, 7,10,10,14, 4, 7
- db 7,11, 7,10,10,14, 7,11,10,14
- db 11,14,14,18, 0, 4, 3, 7, 3, 6
- db 6,10, 3, 7, 6,10, 7,10,10,14
- db 3, 6, 6,10, 6, 9, 9,13, 6,10
- db 9,13,10,13,13,17, 4, 7, 6,10
- db 7,10,10,14, 6,10, 9,13,10,13
- db 13,17, 7,10,10,14,10,13,13,17
- db 10,14,13,17,14,17,17,21, 0, 3
- db 3, 7, 3, 6, 6,10, 2, 6, 5, 9
- db 6, 9, 9,13, 3, 6, 6,10, 6, 9
- db 9,13, 6,10, 9,13,10,13,13,17
- db 3, 6, 5, 9, 6, 9, 9,13, 5, 9
- db 8,12, 9,12,12,16, 6, 9, 9,13
- db 9,12,12,16, 9,13,12,16,13,16
- db 16,20, 3, 7, 6,10, 6, 9, 9,13
- db 6,10, 9,13,10,13,13,17, 6, 9
- db 9,13, 9,12,12,16, 9,13,12,16
- db 13,16,16,20, 7,10, 9,13,10,13
- db 13,17, 9,13,12,16,13,16,16,20
- db 10,13,13,17,13,16,16,20,13,17
- db 16,20,17,20,20,24
-
-
-SECTION .text
-
-;***********************************************************************
-;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4DcAc_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_2_PARA
- movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
- movdqa xmm1, [r1+16] ; f e d c b a 9 8
- pextrw r2d, xmm0, 7 ; ecx = 7
- pextrw r3d, xmm1, 2 ; edx = a
- pextrw r1d, xmm0, 5 ; eax = 5
- pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
- pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
- pextrw r2d, xmm1, 0 ; ecx = 8
- pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
- pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
- pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
- pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
- pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
- pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- %ifdef X86_32
- pop r3
- %endif
- ret
-
-;***********************************************************************
-;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4DcAc_ssse3
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- pextrw r2d, xmm0, 7 ; ecx = [7]
- pextrw r1d, xmm1, 0 ; eax = [8]
- pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
- pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
- pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
-
- movdqa [r0],xmm0
- movdqa [r0+16], xmm1
- ret
-;***********************************************************************
-;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4Ac_sse2
- %assign push_num 0
- LOAD_2_PARA
- movdqa xmm0, [r1]
- movdqa xmm1, [r1+16]
- movdqa xmm2, xmm0
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm2, xmm1
-
- movdqa xmm3, xmm0
- punpckldq xmm0, xmm2
- punpckhdq xmm3, xmm2
- pextrw r1d , xmm0, 3
- pextrw r2d , xmm0, 7
- pinsrw xmm0, r1d, 7
- pextrw r1d, xmm3, 4
- pinsrw xmm3, r2d, 4
- pextrw r2d, xmm3, 0
- pinsrw xmm3, r1d, 0
- pinsrw xmm0, r2d, 3
-
- pshufhw xmm1, xmm0, 0x93
- pshuflw xmm2, xmm3, 0x39
-
- movdqa xmm3, xmm2
- psrldq xmm1, 2
- pslldq xmm3, 14
- por xmm1, xmm3
- psrldq xmm2, 2
- movdqa [r0],xmm1
- movdqa [r0+16], xmm2
- ret
-
-
-;***********************************************************************
-;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
-;***********************************************************************
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
- %ifdef X86_32
- push r3
- %assign push_num 1
- %else
- %assign push_num 0
- %endif
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
-
- packsswb xmm0, xmm1
- ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
- xor r3, r3
- pxor xmm3, xmm3
- pcmpeqb xmm0, xmm3
- pmovmskb r3d, xmm0
-
- xor r3, 0xffff
-
- xor r0, r0
- mov r2, 7
- mov r1, 8
-.loop_low8_find1:
- bt r3, r2
- jc .loop_high8_find1
- dec r2
- jnz .loop_low8_find1
-.loop_high8_find1:
- bt r3, r1
- jc .find1end
- inc r1
- cmp r1,16
- jb .loop_high8_find1
-.find1end:
- sub r1, r2
- sub r1, 1
- lea r2, [i_ds_table]
- add r0b, [r2+r1]
- mov r1, r3
- and r3, 0xff
- shr r1, 8
- and r1, 0xff
- lea r2 , [low_mask_table]
- add r0b, [r2 +r3]
- lea r2, [high_mask_table]
- add r0b, [r2+r1]
- %ifdef X86_32
- pop r3
- %else
- mov retrd, r0d
- %endif
- ret
-
-
-;***********************************************************************
-; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
-;***********************************************************************
-WELS_EXTERN WelsGetNoneZeroCount_sse2
- %assign push_num 0
- LOAD_1_PARA
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+16]
- pxor xmm2, xmm2
- pcmpeqw xmm0, xmm2
- pcmpeqw xmm1, xmm2
- packsswb xmm1, xmm0
- xor r1, r1
- pmovmskb r1d, xmm1
- xor r1d, 0xffff
- mov r2, r1
- and r1, 0xff
- shr r2, 8
-; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
-; xor retr, retr
- ;add al, [nozero_count_table+r2]
- lea r0 , [nozero_count_table]
- movzx r2, byte [r0+r2]
- movzx r1, byte [r0+r1]
- mov retrq, r2
- add retrq, r1
- ;add al, [nozero_count_table+r1]
- ret
-
--- /dev/null
+++ b/codec/encoder/core/x86/coeff.asm
@@ -1,0 +1,459 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* coeff.asm
+;*
+;* Abstract
+;* cavlc
+;*
+;* History
+;* 09/08/2010 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+%ifdef X86_32
+SECTION .rodata align=16
+
+align 16
+sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8 ; eight bytes of 8; CavlcParamCal_sse2 loads it with movq and adds it (paddb) to the high-half position bytes
+
+ALIGN 16
+sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1 ; AND mask: clears byte 7, keeps every other byte
+
+align 16
+byte_1pos_table: ; 256 entries of 8 bytes, indexed by an 8-bit mask: bytes 0..6 = positions of the set bits (highest first, zero padded), byte 7 = number of set bits
+ db 0,0,0,0,0,0,0,0, ;0
+ db 0,0,0,0,0,0,0,1, ;1
+ db 1,0,0,0,0,0,0,1, ;2
+ db 1,0,0,0,0,0,0,2, ;3
+ db 2,0,0,0,0,0,0,1, ;4
+ db 2,0,0,0,0,0,0,2, ;5
+ db 2,1,0,0,0,0,0,2, ;6
+ db 2,1,0,0,0,0,0,3, ;7
+ db 3,0,0,0,0,0,0,1, ;8
+ db 3,0,0,0,0,0,0,2, ;9
+ db 3,1,0,0,0,0,0,2, ;10
+ db 3,1,0,0,0,0,0,3, ;11
+ db 3,2,0,0,0,0,0,2, ;12
+ db 3,2,0,0,0,0,0,3, ;13
+ db 3,2,1,0,0,0,0,3, ;14
+ db 3,2,1,0,0,0,0,4, ;15
+ db 4,0,0,0,0,0,0,1, ;16
+ db 4,0,0,0,0,0,0,2, ;17
+ db 4,1,0,0,0,0,0,2, ;18
+ db 4,1,0,0,0,0,0,3, ;19
+ db 4,2,0,0,0,0,0,2, ;20
+ db 4,2,0,0,0,0,0,3, ;21
+ db 4,2,1,0,0,0,0,3, ;22
+ db 4,2,1,0,0,0,0,4, ;23
+ db 4,3,0,0,0,0,0,2, ;24
+ db 4,3,0,0,0,0,0,3, ;25
+ db 4,3,1,0,0,0,0,3, ;26
+ db 4,3,1,0,0,0,0,4, ;27
+ db 4,3,2,0,0,0,0,3, ;28
+ db 4,3,2,0,0,0,0,4, ;29
+ db 4,3,2,1,0,0,0,4, ;30
+ db 4,3,2,1,0,0,0,5, ;31
+ db 5,0,0,0,0,0,0,1, ;32
+ db 5,0,0,0,0,0,0,2, ;33
+ db 5,1,0,0,0,0,0,2, ;34
+ db 5,1,0,0,0,0,0,3, ;35
+ db 5,2,0,0,0,0,0,2, ;36
+ db 5,2,0,0,0,0,0,3, ;37
+ db 5,2,1,0,0,0,0,3, ;38
+ db 5,2,1,0,0,0,0,4, ;39
+ db 5,3,0,0,0,0,0,2, ;40
+ db 5,3,0,0,0,0,0,3, ;41
+ db 5,3,1,0,0,0,0,3, ;42
+ db 5,3,1,0,0,0,0,4, ;43
+ db 5,3,2,0,0,0,0,3, ;44
+ db 5,3,2,0,0,0,0,4, ;45
+ db 5,3,2,1,0,0,0,4, ;46
+ db 5,3,2,1,0,0,0,5, ;47
+ db 5,4,0,0,0,0,0,2, ;48
+ db 5,4,0,0,0,0,0,3, ;49
+ db 5,4,1,0,0,0,0,3, ;50
+ db 5,4,1,0,0,0,0,4, ;51
+ db 5,4,2,0,0,0,0,3, ;52
+ db 5,4,2,0,0,0,0,4, ;53
+ db 5,4,2,1,0,0,0,4, ;54
+ db 5,4,2,1,0,0,0,5, ;55
+ db 5,4,3,0,0,0,0,3, ;56
+ db 5,4,3,0,0,0,0,4, ;57
+ db 5,4,3,1,0,0,0,4, ;58
+ db 5,4,3,1,0,0,0,5, ;59
+ db 5,4,3,2,0,0,0,4, ;60
+ db 5,4,3,2,0,0,0,5, ;61
+ db 5,4,3,2,1,0,0,5, ;62
+ db 5,4,3,2,1,0,0,6, ;63
+ db 6,0,0,0,0,0,0,1, ;64
+ db 6,0,0,0,0,0,0,2, ;65
+ db 6,1,0,0,0,0,0,2, ;66
+ db 6,1,0,0,0,0,0,3, ;67
+ db 6,2,0,0,0,0,0,2, ;68
+ db 6,2,0,0,0,0,0,3, ;69
+ db 6,2,1,0,0,0,0,3, ;70
+ db 6,2,1,0,0,0,0,4, ;71
+ db 6,3,0,0,0,0,0,2, ;72
+ db 6,3,0,0,0,0,0,3, ;73
+ db 6,3,1,0,0,0,0,3, ;74
+ db 6,3,1,0,0,0,0,4, ;75
+ db 6,3,2,0,0,0,0,3, ;76
+ db 6,3,2,0,0,0,0,4, ;77
+ db 6,3,2,1,0,0,0,4, ;78
+ db 6,3,2,1,0,0,0,5, ;79
+ db 6,4,0,0,0,0,0,2, ;80
+ db 6,4,0,0,0,0,0,3, ;81
+ db 6,4,1,0,0,0,0,3, ;82
+ db 6,4,1,0,0,0,0,4, ;83
+ db 6,4,2,0,0,0,0,3, ;84
+ db 6,4,2,0,0,0,0,4, ;85
+ db 6,4,2,1,0,0,0,4, ;86
+ db 6,4,2,1,0,0,0,5, ;87
+ db 6,4,3,0,0,0,0,3, ;88
+ db 6,4,3,0,0,0,0,4, ;89
+ db 6,4,3,1,0,0,0,4, ;90
+ db 6,4,3,1,0,0,0,5, ;91
+ db 6,4,3,2,0,0,0,4, ;92
+ db 6,4,3,2,0,0,0,5, ;93
+ db 6,4,3,2,1,0,0,5, ;94
+ db 6,4,3,2,1,0,0,6, ;95
+ db 6,5,0,0,0,0,0,2, ;96
+ db 6,5,0,0,0,0,0,3, ;97
+ db 6,5,1,0,0,0,0,3, ;98
+ db 6,5,1,0,0,0,0,4, ;99
+ db 6,5,2,0,0,0,0,3, ;100
+ db 6,5,2,0,0,0,0,4, ;101
+ db 6,5,2,1,0,0,0,4, ;102
+ db 6,5,2,1,0,0,0,5, ;103
+ db 6,5,3,0,0,0,0,3, ;104
+ db 6,5,3,0,0,0,0,4, ;105
+ db 6,5,3,1,0,0,0,4, ;106
+ db 6,5,3,1,0,0,0,5, ;107
+ db 6,5,3,2,0,0,0,4, ;108
+ db 6,5,3,2,0,0,0,5, ;109
+ db 6,5,3,2,1,0,0,5, ;110
+ db 6,5,3,2,1,0,0,6, ;111
+ db 6,5,4,0,0,0,0,3, ;112
+ db 6,5,4,0,0,0,0,4, ;113
+ db 6,5,4,1,0,0,0,4, ;114
+ db 6,5,4,1,0,0,0,5, ;115
+ db 6,5,4,2,0,0,0,4, ;116
+ db 6,5,4,2,0,0,0,5, ;117
+ db 6,5,4,2,1,0,0,5, ;118
+ db 6,5,4,2,1,0,0,6, ;119
+ db 6,5,4,3,0,0,0,4, ;120
+ db 6,5,4,3,0,0,0,5, ;121
+ db 6,5,4,3,1,0,0,5, ;122
+ db 6,5,4,3,1,0,0,6, ;123
+ db 6,5,4,3,2,0,0,5, ;124
+ db 6,5,4,3,2,0,0,6, ;125
+ db 6,5,4,3,2,1,0,6, ;126
+ db 6,5,4,3,2,1,0,7, ;127
+ db 7,0,0,0,0,0,0,1, ;128
+ db 7,0,0,0,0,0,0,2, ;129
+ db 7,1,0,0,0,0,0,2, ;130
+ db 7,1,0,0,0,0,0,3, ;131
+ db 7,2,0,0,0,0,0,2, ;132
+ db 7,2,0,0,0,0,0,3, ;133
+ db 7,2,1,0,0,0,0,3, ;134
+ db 7,2,1,0,0,0,0,4, ;135
+ db 7,3,0,0,0,0,0,2, ;136
+ db 7,3,0,0,0,0,0,3, ;137
+ db 7,3,1,0,0,0,0,3, ;138
+ db 7,3,1,0,0,0,0,4, ;139
+ db 7,3,2,0,0,0,0,3, ;140
+ db 7,3,2,0,0,0,0,4, ;141
+ db 7,3,2,1,0,0,0,4, ;142
+ db 7,3,2,1,0,0,0,5, ;143
+ db 7,4,0,0,0,0,0,2, ;144
+ db 7,4,0,0,0,0,0,3, ;145
+ db 7,4,1,0,0,0,0,3, ;146
+ db 7,4,1,0,0,0,0,4, ;147
+ db 7,4,2,0,0,0,0,3, ;148
+ db 7,4,2,0,0,0,0,4, ;149
+ db 7,4,2,1,0,0,0,4, ;150
+ db 7,4,2,1,0,0,0,5, ;151
+ db 7,4,3,0,0,0,0,3, ;152
+ db 7,4,3,0,0,0,0,4, ;153
+ db 7,4,3,1,0,0,0,4, ;154
+ db 7,4,3,1,0,0,0,5, ;155
+ db 7,4,3,2,0,0,0,4, ;156
+ db 7,4,3,2,0,0,0,5, ;157
+ db 7,4,3,2,1,0,0,5, ;158
+ db 7,4,3,2,1,0,0,6, ;159
+ db 7,5,0,0,0,0,0,2, ;160
+ db 7,5,0,0,0,0,0,3, ;161
+ db 7,5,1,0,0,0,0,3, ;162
+ db 7,5,1,0,0,0,0,4, ;163
+ db 7,5,2,0,0,0,0,3, ;164
+ db 7,5,2,0,0,0,0,4, ;165
+ db 7,5,2,1,0,0,0,4, ;166
+ db 7,5,2,1,0,0,0,5, ;167
+ db 7,5,3,0,0,0,0,3, ;168
+ db 7,5,3,0,0,0,0,4, ;169
+ db 7,5,3,1,0,0,0,4, ;170
+ db 7,5,3,1,0,0,0,5, ;171
+ db 7,5,3,2,0,0,0,4, ;172
+ db 7,5,3,2,0,0,0,5, ;173
+ db 7,5,3,2,1,0,0,5, ;174
+ db 7,5,3,2,1,0,0,6, ;175
+ db 7,5,4,0,0,0,0,3, ;176
+ db 7,5,4,0,0,0,0,4, ;177
+ db 7,5,4,1,0,0,0,4, ;178
+ db 7,5,4,1,0,0,0,5, ;179
+ db 7,5,4,2,0,0,0,4, ;180
+ db 7,5,4,2,0,0,0,5, ;181
+ db 7,5,4,2,1,0,0,5, ;182
+ db 7,5,4,2,1,0,0,6, ;183
+ db 7,5,4,3,0,0,0,4, ;184
+ db 7,5,4,3,0,0,0,5, ;185
+ db 7,5,4,3,1,0,0,5, ;186
+ db 7,5,4,3,1,0,0,6, ;187
+ db 7,5,4,3,2,0,0,5, ;188
+ db 7,5,4,3,2,0,0,6, ;189
+ db 7,5,4,3,2,1,0,6, ;190
+ db 7,5,4,3,2,1,0,7, ;191
+ db 7,6,0,0,0,0,0,2, ;192
+ db 7,6,0,0,0,0,0,3, ;193
+ db 7,6,1,0,0,0,0,3, ;194
+ db 7,6,1,0,0,0,0,4, ;195
+ db 7,6,2,0,0,0,0,3, ;196
+ db 7,6,2,0,0,0,0,4, ;197
+ db 7,6,2,1,0,0,0,4, ;198
+ db 7,6,2,1,0,0,0,5, ;199
+ db 7,6,3,0,0,0,0,3, ;200
+ db 7,6,3,0,0,0,0,4, ;201
+ db 7,6,3,1,0,0,0,4, ;202
+ db 7,6,3,1,0,0,0,5, ;203
+ db 7,6,3,2,0,0,0,4, ;204
+ db 7,6,3,2,0,0,0,5, ;205
+ db 7,6,3,2,1,0,0,5, ;206
+ db 7,6,3,2,1,0,0,6, ;207
+ db 7,6,4,0,0,0,0,3, ;208
+ db 7,6,4,0,0,0,0,4, ;209
+ db 7,6,4,1,0,0,0,4, ;210
+ db 7,6,4,1,0,0,0,5, ;211
+ db 7,6,4,2,0,0,0,4, ;212
+ db 7,6,4,2,0,0,0,5, ;213
+ db 7,6,4,2,1,0,0,5, ;214
+ db 7,6,4,2,1,0,0,6, ;215
+ db 7,6,4,3,0,0,0,4, ;216
+ db 7,6,4,3,0,0,0,5, ;217
+ db 7,6,4,3,1,0,0,5, ;218
+ db 7,6,4,3,1,0,0,6, ;219
+ db 7,6,4,3,2,0,0,5, ;220
+ db 7,6,4,3,2,0,0,6, ;221
+ db 7,6,4,3,2,1,0,6, ;222
+ db 7,6,4,3,2,1,0,7, ;223
+ db 7,6,5,0,0,0,0,3, ;224
+ db 7,6,5,0,0,0,0,4, ;225
+ db 7,6,5,1,0,0,0,4, ;226
+ db 7,6,5,1,0,0,0,5, ;227
+ db 7,6,5,2,0,0,0,4, ;228
+ db 7,6,5,2,0,0,0,5, ;229
+ db 7,6,5,2,1,0,0,5, ;230
+ db 7,6,5,2,1,0,0,6, ;231
+ db 7,6,5,3,0,0,0,4, ;232
+ db 7,6,5,3,0,0,0,5, ;233
+ db 7,6,5,3,1,0,0,5, ;234
+ db 7,6,5,3,1,0,0,6, ;235
+ db 7,6,5,3,2,0,0,5, ;236
+ db 7,6,5,3,2,0,0,6, ;237
+ db 7,6,5,3,2,1,0,6, ;238
+ db 7,6,5,3,2,1,0,7, ;239
+ db 7,6,5,4,0,0,0,4, ;240
+ db 7,6,5,4,0,0,0,5, ;241
+ db 7,6,5,4,1,0,0,5, ;242
+ db 7,6,5,4,1,0,0,6, ;243
+ db 7,6,5,4,2,0,0,5, ;244
+ db 7,6,5,4,2,0,0,6, ;245
+ db 7,6,5,4,2,1,0,6, ;246
+ db 7,6,5,4,2,1,0,7, ;247
+ db 7,6,5,4,3,0,0,5, ;248
+ db 7,6,5,4,3,0,0,6, ;249
+ db 7,6,5,4,3,1,0,6, ;250
+ db 7,6,5,4,3,1,0,7, ;251
+ db 7,6,5,4,3,2,0,6, ;252
+ db 7,6,5,4,3,2,0,7, ;253
+ db 7,6,5,4,3,2,1,7, ;254
+ db 7,6,5,4,3,2,1,8, ;255
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+; CavlcParamCal_sse2: assembled only when X86_32 is defined; all arguments are read from the stack.
+; Packs the coefficients to bytes, builds a 16-bit non-zero mask and uses byte_1pos_table to write
+; the non-zero levels (highest index first), their count, and the zero runs between them.
+;***********************************************************************
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;***********************************************************************
+WELS_EXTERN CavlcParamCal_sse2
+ push ebx
+ push edi
+ push esi
+
+ mov eax, [esp+16] ;coffLevel (arg 1: three pushes + return address = 16 bytes)
+ mov edi, [esp+24] ;Level (arg 3), used as the output cursor
+ mov ebx, [esp+32] ;endIdx (arg 5)
+ cmp ebx, 3
+ jne .Level16 ; endIdx != 3: load all 16 coefficients
+ pxor xmm1, xmm1 ; endIdx == 3: coefficients 8..15 are taken as zero
+ movq xmm0, [eax] ; removed QWORD; loads only 8 bytes (coefficients 0..3)
+ jmp .Cal_begin
+.Level16:
+ movdqa xmm0, [eax] ; coefficients 0..7 (aligned load)
+ movdqa xmm1, [eax+16] ; coefficients 8..15 (aligned load)
+.Cal_begin:
+ movdqa xmm2, xmm0 ; NOTE(review): xmm2 is not read again before it is zeroed in the getRun part
+ packsswb xmm0, xmm1 ; 16 words -> 16 signed-saturated bytes
+ movdqa xmm4, xmm0
+ pxor xmm3, xmm3
+ pcmpgtb xmm0, xmm3 ; byte > 0
+ pcmpgtb xmm3, xmm4 ; 0 > byte
+ por xmm0, xmm3 ; 0xff in every byte that is non-zero
+ pmovmskb edx, xmm0 ; edx bit i set <=> coefficient i is non-zero
+ cmp edx, 0
+ je near .return ; all zero: nothing is written and eax still holds the coffLevel pointer
+ movdqa xmm6, [sse2_b_1] ; mask that clears byte 7 (the count byte) of a table entry
+ pcmpeqw xmm7, xmm7 ;generate -1
+ mov ebx, 0xff ; clears bits 8..31 of ebx; bl is replaced by the next mov
+ ;pinsrw xmm6, ebx, 3
+
+ mov bl, dh ; ebx = mask bits 8..15 (coefficients 8..15)
+
+ lea ebx, [byte_1pos_table+8*ebx] ; ebx -> 8-byte table entry for the high mask byte
+ movq xmm0, [ebx] ; keep the entry for the run computation
+ pextrw ecx, xmm0, 3 ; word 3 = entry bytes 6 and 7
+ shr ecx, 8 ; ecx = entry byte 7 = number of non-zero coefficients among 8..15
+ mov dh, cl ; stash the high-half count in dh
+
+.loopHighFind0:
+ cmp ecx, 0
+ je .loopHighFind0End
+ ;mov esi, [ebx]
+ ;and esi, 0xff
+ movzx esi, byte [ebx] ; next position inside the high half
+ add esi, 8 ; -> coefficient index 8..15
+ mov esi, [eax+2*esi] ; dword load; only the low word is stored
+ mov [edi], si
+ add edi, 2
+ ;add ebx, 1
+ inc ebx
+ dec ecx
+ jmp .loopHighFind0
+.loopHighFind0End:
+ mov cl, dh ; cl = high-half count again (ecx was counted down to 0)
+ cmp cl, 8
+ pand xmm0, xmm6 ; drop the count byte; pand leaves EFLAGS alone, so jne still tests cl against 8
+ jne .LowByteFind0
+ sub edi, 2 ; count 8: the 8th table byte was the count, not a position, so redo the last store
+ mov esi, [eax+16] ; coefficient 8
+ mov [edi], esi ; NOTE(review): 32-bit store here, while the matching low-half fix-up stores 16 bits (dx)
+ add edi, 2
+.LowByteFind0:
+ and edx, 0xff ; mask bits 0..7 (coefficients 0..7); also discards the count kept in dh
+ lea ebx, [byte_1pos_table+8*edx] ; ebx -> table entry for the low mask byte
+ movq xmm1, [ebx]
+ pextrw esi, xmm1, 3
+ or esi, 0xff ; esi = (low count << 8) | 0xff
+ or ecx, 0xff00
+ and ecx, esi ; ch = low-half count, cl = high-half count
+ shr esi, 8 ; esi = low-half count, used as the loop counter
+ pand xmm1, xmm6 ; drop the count byte of the low entry
+.loopLowFind0:
+ cmp esi, 0
+ je .loopLowFind0End
+ ;mov edx, [ebx]
+ ;and edx, 0xff
+ movzx edx, byte [ebx] ; next position = coefficient index 0..7
+ mov edx, [eax+2*edx] ; dword load; only the low word is stored
+ mov [edi], dx
+ add edi, 2
+ ;add ebx, 1
+ inc ebx
+ dec esi
+ jmp .loopLowFind0
+.loopLowFind0End:
+ cmp ch, 8
+ jne .getLevelEnd
+ sub edi, 2 ; count 8: same fix-up as above, the last level is coefficient 0
+ mov edx, [eax]
+ mov [edi], dx
+.getLevelEnd:
+ mov edx, [esp+28] ;total_coeffs (arg 4)
+ ;mov ebx, ecx
+ ;and ebx, 0xff
+ movzx ebx, byte cl ; ebx = high-half count
+ add cl, ch ; cl = total number of non-zero coefficients
+ mov [edx], cl ; NOTE(review): only the low byte of *total_coeffs is written
+;getRun
+ movq xmm5, [sse2_b8]
+ paddb xmm0, xmm5 ; add 8 to the bytes of the high-half entry (positions -> indices 8..15)
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ mov eax, 8
+ sub eax, ebx
+ shl eax, 3 ; eax = 8 * (8 - high count), a shift amount in bits
+ shl ebx, 3 ; ebx = 8 * high count, a shift amount in bits
+ pinsrw xmm2, ebx, 0
+ pinsrw xmm3, eax, 0
+ psllq xmm0, xmm3
+ psrlq xmm0, xmm3 ; keep only the first 'high count' bytes of the high-half list
+ movdqa xmm4, xmm1
+ psllq xmm1, xmm2 ; move the low-half list up behind the high-half bytes
+ psrlq xmm4, xmm3 ; low-half bytes that the shift above pushed out of the low qword
+ punpcklqdq xmm1, xmm4
+ por xmm0, xmm1 ; xmm0 = indices of all non-zero coefficients, highest first
+
+ pextrw eax, xmm0, 0
+ and eax, 0xff ; index of the highest non-zero coefficient
+ inc eax
+ sub al, cl ; al = (highest index + 1) - total count; eax is not written again before ret
+ movdqa xmm1, xmm0
+ paddb xmm1, xmm7 ; idx[i] - 1 (xmm7 is all ones)
+ psrldq xmm0, 1 ; idx[i+1]
+ psubb xmm1, xmm0 ; run[i] = idx[i] - 1 - idx[i+1]
+ mov ecx, [esp+20] ;run (arg 2)
+ movdqa [ecx], xmm1 ; aligned 16-byte store: all 16 run bytes are written
+;getRunEnd
+.return:
+ pop esi
+ pop edi
+ pop ebx
+ ret
+%endif
--- /dev/null
+++ b/codec/encoder/core/x86/dct.asm
@@ -1,0 +1,504 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* dct.asm
+;*
+;* Abstract
+;* WelsDctFourT4_sse2
+;*
+;* History
+;* 8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+
+align 16
+SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
+ dw 10, 13, 10, 13, 13, 16, 13, 16,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 18, 23, 18, 23, 23, 29, 23, 29,
+ dw 18, 23, 18, 23, 23, 29, 23, 29
+
+
+;***********************************************************************
+; MMX functions
+;***********************************************************************
+
+%macro MMX_LoadDiff4P 5 ; %1 = dst, %2 = tmp, %3/%4 = source addresses, %5 = register supplying the high bytes when widening
+ movd %1, [%3] ; 4 bytes from [%3]
+ movd %2, [%4] ; 4 bytes from [%4]
+ punpcklbw %1, %5 ; interleave with %5: bytes -> words
+ punpcklbw %2, %5
+ psubw %1, %2 ; %1 = four 16-bit differences [%3] - [%4]
+%endmacro
+
+%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
+ MMX_LoadDiff4P %1, %9, %5, %7, %10 ; row 0
+ MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10 ; row 1
+ lea %5, [%5+2*%6] ; advance pix1 by two rows (modifies the caller's register)
+ lea %7, [%7+2*%8] ; advance pix2 by two rows (modifies the caller's register)
+ MMX_LoadDiff4P %3, %9, %5, %7, %10 ; row 2
+ MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10 ; row 3
+%endmacro
+
+%macro MMX_SumSubMul2 3 ; out (from the input values): %1 = 2*%1 + %2, %3 = %1 - 2*%2; %2 is left doubled
+ movq %3, %1 ; %3 = old %1
+ psllw %1, $01 ; %1 = 2 * %1
+ paddw %1, %2
+ psllw %2, $01 ; %2 = 2 * %2
+ psubw %3, %2
+%endmacro
+
+%macro MMX_SumSubDiv2 3 ; out (from the input values): %3 = %1 + (%2 >> 1), %1 = (%1 >> 1) - %2; %2 unchanged
+ movq %3, %2
+ psraw %3, $01 ; arithmetic shift: %2 >> 1
+ paddw %3, %1
+ psraw %1, $01 ; arithmetic shift: %1 >> 1
+ psubw %1, %2
+%endmacro
+
+%macro MMX_SumSub 3 ; out (from the input values): %1 = %1 + %2, %2 = %2 - %1; %3 = scratch
+ movq %3, %2 ; save old %2
+ psubw %2, %1 ; %2 = %2 - %1
+ paddw %1, %3 ; %1 = %1 + old %2
+%endmacro
+
+%macro MMX_DCT 6 ; one butterfly pass over %1..%4; results are left in %3, %1, %4, %5 (%2 and %6 are scratch)
+ MMX_SumSub %4, %1, %6 ; %4 = %4 + %1, %1 = %1 - %4
+ MMX_SumSub %3, %2, %6 ; %3 = %3 + %2, %2 = %2 - %3
+ MMX_SumSub %3, %4, %6 ; %3 = sum of both sums, %4 = their difference
+ MMX_SumSubMul2 %1, %2, %5 ; %1 = 2*%1 + %2, %5 = %1 - 2*%2
+%endmacro
+
+%macro MMX_IDCT 6 ; one inverse butterfly pass; inputs %2..%5, results are left in %1, %3, %4, %5 (%6 is scratch)
+ MMX_SumSub %4, %5, %6 ; %4 = %4 + %5, %5 = %5 - %4
+ MMX_SumSubDiv2 %3, %2, %1 ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2; %1 is overwritten without being read
+ MMX_SumSub %1, %4, %6 ; %1 = %1 + %4, %4 = %4 - %1
+ MMX_SumSub %3, %5, %6 ; %3 = %3 + %5, %5 = %5 - %3
+%endmacro
+
+%macro MMX_StoreDiff4P 6 ; %1 = residual words, %2 = tmp, %3 = rounding term, %4 = widening register, %5 = destination operand, %6 = prediction operand
+ movd %2, %6 ; 4 prediction bytes
+ punpcklbw %2, %4 ; bytes -> words
+ paddw %1, %3
+ psraw %1, $06 ; %1 = (%1 + %3) >> 6
+ paddsw %1, %2 ; signed-saturating add of the prediction
+ packuswb %1, %2 ; words -> bytes with unsigned saturation
+ movd %5, %1 ; store the low 4 bytes
+%endmacro
+SECTION .text
+;***********************************************************************
+; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctT4_mmx
+ %assign push_num 0
+ LOAD_5_PARA ; defined outside this chunk; presumably maps the 5 arguments onto r0..r4 - TODO confirm
+ SIGN_EXTENSION r2, r2d ; i_pix1 is a 32-bit stride that is used in address arithmetic below
+ SIGN_EXTENSION r4, r4d ; same for i_pix2
+ WELS_Zero mm7 ; mm7 is passed below as the "0(mm)" operand for byte -> word widening
+
+ MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7 ; mm1..mm4 = rows 0..3 of pix1 - pix2 as words; advances r1 and r3 by two rows
+
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6 ; first pass; results in mm3, mm1, mm4, mm5
+ MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2 ; defined outside this chunk; presumably a 4x4 word transpose - TODO confirm operand roles
+
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6 ; second pass; results in mm2, mm3, mm4, mm1
+ MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
+
+ movq [r0+ 0], mm2 ; 4 x 8 bytes = 16 coefficients written at pDct
+ movq [r0+ 8], mm1
+ movq [r0+16], mm5
+ movq [r0+24], mm4
+ WELSEMMS ; defined outside this chunk; presumably leaves MMX state (emms) - TODO confirm
+ LOAD_5_PARA_POP
+ ret
+; WelsIDctT4Rec_mmx: reads 16 words from rs, runs two MMX_IDCT passes with MMX_Trans4x4W in between,
+; then writes four 4-byte rows: rec = saturate_u8(pred + ((value + rounding) >> 6)).
+;***********************************************************************
+; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_mmx
+ %assign push_num 0
+ LOAD_5_PARA ; defined outside this chunk; presumably maps the 5 arguments onto r0..r4 - TODO confirm
+ SIGN_EXTENSION r1, r1d ; stride is a 32-bit value used in address arithmetic below
+ SIGN_EXTENSION r3, r3d ; same for pred_stride
+ movq mm0, [r4+ 0] ; four rows of four coefficients from rs
+ movq mm1, [r4+ 8]
+ movq mm2, [r4+16]
+ movq mm3, [r4+24]
+
+ MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 ; defined outside this chunk; presumably a 4x4 word transpose - TODO confirm
+ MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 ; first pass; results in mm1, mm3, mm4, mm0
+ MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
+ MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 ; second pass; results in mm3, mm4, mm2, mm1
+
+ WELS_Zero mm7 ; widening register for MMX_StoreDiff4P
+ WELS_DW32 mm6 ; defined outside this chunk; presumably fills mm6 with words of 32 (rounding for >> 6) - TODO confirm
+
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2] ; row 0
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3] ; row 1
+ lea r0, [r0+2*r1] ; advance rec by two rows
+ lea r2, [r2+2*r3] ; advance pred by two rows
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2] ; row 2
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3] ; row 3
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+
+;***********************************************************************
+; SSE2 functions
+;***********************************************************************
+%macro SSE2_Store4x8p 6 ; %1 = base address register, %2..%5 = data registers, %6 = extra register receiving one SSE2_XSawp result
+ SSE2_XSawp qdq, %2, %3, %6 ; SSE2_XSawp is defined outside this chunk; presumably a qword interleave - TODO confirm
+ SSE2_XSawp qdq, %4, %5, %3
+ MOVDQ [%1+0x00], %2 ; four 16-byte stores covering [%1, %1+0x40)
+ MOVDQ [%1+0x10], %4
+ MOVDQ [%1+0x20], %6
+ MOVDQ [%1+0x30], %3
+%endmacro
+
+%macro SSE2_Load4x8p 6 ; %1 = base address register, %2..%6 = registers; loads 64 bytes, then regroups them with SSE2_XSawp
+ MOVDQ %2, [%1+0x00] ; four 16-byte loads covering [%1, %1+0x40)
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
+ SSE2_XSawp qdq, %4, %3, %5 ; SSE2_XSawp is defined outside this chunk; presumably a qword interleave - TODO confirm
+ SSE2_XSawp qdq, %2, %6, %3
+%endmacro
+
+%macro SSE2_SumSubMul2 3 ; out (from the input values): %1 = 2*%1 + %2, %3 = %1 - 2*%2; %2 unchanged
+ movdqa %3, %1 ; %3 = old %1
+ paddw %1, %1 ; %1 = 2 * %1
+ paddw %1, %2
+ psubw %3, %2 ; subtract %2 twice instead of doubling it
+ psubw %3, %2
+%endmacro
+
+%macro SSE2_SumSubDiv2 4 ; out (from the input values): %1 = %1 + (%2 >> 1), %4 = (%1 >> 1) - %2; %2 is left halved, %3 = old %2
+ movdqa %4, %1
+ movdqa %3, %2 ; keep the unshifted %2
+ psraw %2, $01 ; arithmetic shift: %2 >> 1
+ psraw %4, $01 ; arithmetic shift: %1 >> 1
+ paddw %1, %2
+ psubw %4, %3
+%endmacro
+
+%macro SSE2_StoreDiff8p 6 ; 6-parameter form: %1 = residual words, %2 = tmp, %3 = rounding term, %4 = widening register, %5 = destination operand, %6 = prediction operand
+ paddw %1, %3
+ psraw %1, $06 ; %1 = (%1 + %3) >> 6
+ movq %2, %6 ; 8 prediction bytes
+ punpcklbw %2, %4 ; bytes -> words
+ paddsw %2, %1 ; signed-saturating add of the residual
+ packuswb %2, %2 ; words -> bytes with unsigned saturation
+ movq %5, %2 ; store 8 bytes
+%endmacro
+
+%macro SSE2_StoreDiff8p 5 ; 5-parameter form (no rounding or shift): %1 = words to add, %2 = tmp, %3 = widening register, %4 = destination operand, %5 = prediction operand
+ movq %2, %5 ; 8 prediction bytes
+ punpcklbw %2, %3 ; bytes -> words
+ paddsw %2, %1 ; signed-saturating add; %1 is left unchanged
+ packuswb %2, %2 ; words -> bytes with unsigned saturation
+ movq %4, %2 ; store 8 bytes
+%endmacro
+
+%macro SSE2_Load8DC 6 ; %1..%4 = outputs, %5 = rounding term added before the shift, %6 = memory operand holding 8 words (read with movdqa)
+ movdqa %1, %6 ; %1 = dc0 dc1
+ paddw %1, %5
+ psraw %1, $06 ; (dc + 32) >> 6
+
+ movdqa %2, %1
+ psrldq %2, 4 ; drop two words: dc2 becomes the lowest word
+ punpcklwd %2, %2
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+
+ movdqa %3, %1
+ psrldq %3, 8 ; drop four words: dc4 becomes the lowest word
+ punpcklwd %3, %3
+ punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+
+ movdqa %4, %1
+ psrldq %4, 12 ; drop six words: dc6 becomes the lowest word
+ punpcklwd %4, %4
+ punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+
+ punpcklwd %1, %1 ; done last because %1 was the source of the copies above
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+%endmacro
+
+%macro SSE2_DCT 6 ; one forward butterfly pass; inputs %1, %2, %3, %6; %4 and %5 are overwritten (SSE2_SumSub is defined outside this chunk)
+ SSE2_SumSub %6, %3, %5 ; butterfly on the (%6, %3) pair, %5 = scratch
+ SSE2_SumSub %1, %2, %5 ; butterfly on the (%1, %2) pair
+ SSE2_SumSub %3, %2, %5 ; combines results of the two butterflies above
+ SSE2_SumSubMul2 %6, %1, %4 ; %6 = 2*%6 + %1, %4 = %6 - 2*%1
+%endmacro
+
+%macro SSE2_IDCT 7 ; one inverse butterfly pass; inputs %1, %2, %3, %7; %4, %5, %6 are overwritten (SSE2_SumSub is defined outside this chunk)
+ SSE2_SumSub %7, %2, %6 ; butterfly on the (%7, %2) pair, %6 = scratch
+ SSE2_SumSubDiv2 %1, %3, %5, %4 ; %1 = %1 + (%3 >> 1), %4 = (%1 >> 1) - %3
+ SSE2_SumSub %2, %1, %5 ; combine with the half-weighted terms
+ SSE2_SumSub %7, %4, %5
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_sse2
+ %assign push_num 0
+ LOAD_5_PARA ; defined outside this chunk; presumably maps the 5 arguments onto r0..r4 - TODO confirm
+ PUSH_XMM 8 ; defined outside this chunk; presumably preserves xmm registers where the ABI needs it - TODO confirm
+ SIGN_EXTENSION r2, r2d ; i_pix1 is a 32-bit stride used in address arithmetic below
+ SIGN_EXTENSION r4, r4d ; same for i_pix2
+ pxor xmm7, xmm7 ; zero register handed to SSE2_LoadDiff8P
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3] ; rows 0..3 of the upper half; SSE2_LoadDiff8P is defined outside this chunk
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
+ lea r1, [r1 + 2 * r2] ; advance both sources by two rows
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 ; first pass
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1 ; defined outside this chunk; presumably transposes two 4x4 word blocks - TODO confirm
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2 ; second pass
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 ; writes pDct bytes 0..63
+
+ lea r1, [r1 + 2 * r2] ; move on to rows 4..7
+ lea r3, [r3 + 2 * r4]
+
+ ;Load 4x8
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ] ; rows 4..7, same sequence as above
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+ SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
+
+ lea r0, [r0+64] ; second 64-byte half of the output
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5 ; writes pDct bytes 64..127
+
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+; WelsIDctFourT4Rec_sse2: handles rs in two 64-byte halves; each half is inverse-transformed and
+; added to four 8-pixel prediction rows: rec = saturate_u8(pred + ((value + rounding) >> 6)).
+;***********************************************************************
+; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_sse2
+ %assign push_num 0
+ LOAD_5_PARA ; defined outside this chunk; presumably maps the 5 arguments onto r0..r4 - TODO confirm
+ PUSH_XMM 8 ; defined outside this chunk; presumably preserves xmm registers where the ABI needs it - TODO confirm
+ SIGN_EXTENSION r1, r1d ; stride is a 32-bit value used in address arithmetic below
+ SIGN_EXTENSION r3, r3d ; same for pred_stride
+ ;Load 4x8
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 ; first 64 bytes of rs
+
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 ; defined outside this chunk; presumably transposes two 4x4 word blocks - TODO confirm
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0 ; first pass (xmm6 is used as scratch here)
+ SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1 ; second pass
+
+ WELS_Zero xmm7 ; widening register for SSE2_StoreDiff8p
+ WELS_DW32 xmm6 ; defined outside this chunk; presumably fills xmm6 with words of 32 (rounding for >> 6) - TODO confirm
+
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] ; rows 0..3
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+
+ add r4, 64 ; second 64 bytes of rs
+ lea r0, [r0 + 2 * r1] ; move rec and pred on to rows 4..7
+ lea r2, [r2 + 2 * r3]
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
+
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
+ SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+
+ WELS_Zero xmm7
+ WELS_DW32 xmm6 ; reloaded because SSE2_IDCT used xmm6 as scratch
+
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2] ; rows 4..7
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ POP_XMM
+ LOAD_5_PARA_POP
+ ; pop esi
+ ; pop ebx
+ ret
+
+%macro SSE2_StoreDiff4x8p 8 ; %1/%2 = words added to bytes 0..7 / 8..15, %3 = tmp, %4 = widening register, %5/%6 = dst/pred base, %7/%8 = dst/pred stride
+ SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] ; row 0, bytes 0..7 (5-parameter form: no rounding or shift)
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] ; row 1, bytes 0..7
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8] ; row 0, bytes 8..15
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8] ; row 1, bytes 8..15
+%endmacro
+
+ ;***********************************************************************
+; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
+;***********************************************************************
+WELS_EXTERN WelsIDctRecI16x16Dc_sse2
+ %assign push_num 0
+ LOAD_5_PARA ; defined outside this chunk; presumably maps the 5 arguments onto r0..r4 - TODO confirm
+ PUSH_XMM 8 ; defined outside this chunk; presumably preserves xmm registers where the ABI needs it - TODO confirm
+ SIGN_EXTENSION r1, r1d ; stride is a 32-bit value used in address arithmetic below
+ SIGN_EXTENSION r3, r3d ; same for pred_stride
+ pxor xmm7, xmm7 ; widening register for the store macro
+ WELS_DW32 xmm6 ; defined outside this chunk; presumably words of 32, the rounding term for SSE2_Load8DC - TODO confirm
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4] ; dc0..dc7, rounded, shifted and each repeated over 4 words
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 ; rows 0..1 use dc0..dc3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 ; rows 2..3 use dc0..dc3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 ; rows 4..5 use dc4..dc7
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 ; rows 6..7 use dc4..dc7
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16] ; dc8..dc15 for rows 8..15
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 ; rows 8..9
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 ; rows 10..11
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 ; rows 12..13
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 ; rows 14..15
+ POP_XMM
+ LOAD_5_PARA_POP
+ ret
+
+
+
+%macro SSE2_SumSubD 3 ; 32-bit lanes; out (from the input values): %2 = %1 + %2, %1 = %1 - %2; %3 = scratch
+ movdqa %3, %2 ; save old %2
+ paddd %2, %1 ; %2 = sum
+ psubd %1, %3 ; %1 = difference
+%endmacro
+
+%macro SSE2_SumSubDiv2D 4 ; 32-bit lanes; out: %1 = (%1 + %2 + %3) >> 1, %4 = new %1 - %2; %2 and %3 unchanged
+ paddd %1, %2
+ paddd %1, %3 ; %3 is the rounding term supplied by the caller
+ psrad %1, 1 ; arithmetic shift
+ movdqa %4, %1
+ psubd %4, %2 ; equals (old %1 - %2 + %3) >> 1 when %3 is 1
+%endmacro
+%macro SSE2_Load4Col 5 ; %1 = result (4 dwords), %2..%4 = xmm temporaries, %5 = base address expression; clobbers r2
+ movsx r2, WORD[%5] ; sign-extended word at %5 + 0x00
+ movd %1, r2d
+ movsx r2, WORD[%5 + 0x20] ; word at %5 + 0x20
+ movd %2, r2d
+ punpckldq %1, %2 ; lanes 0 and 1
+ movsx r2, WORD[%5 + 0x80] ; word at %5 + 0x80
+ movd %3, r2d
+ movsx r2, WORD[%5 + 0xa0] ; word at %5 + 0xa0
+ movd %4, r2d
+ punpckldq %3, %4 ; lanes 2 and 3
+ punpcklqdq %1, %3 ; %1 = the four sign-extended words as dwords
+%endmacro
+
+;***********************************************************************
+;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
+;***********************************************************************
+WELS_EXTERN WelsHadamardT4Dc_sse2
+ %assign push_num 0
+ LOAD_2_PARA ; defined outside this chunk; presumably maps the 2 arguments onto r0, r1 - TODO confirm
+ PUSH_XMM 8 ; defined outside this chunk; presumably preserves xmm registers where the ABI needs it - TODO confirm
+ SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1 ; 4 words gathered from pDct as dwords; each SSE2_Load4Col clobbers r2
+ SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+ SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+ SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+
+ SSE2_SumSubD xmm1, xmm2, xmm7 ; first butterfly stage on 32-bit lanes (xmm7 = scratch)
+ SSE2_SumSubD xmm3, xmm4, xmm7
+ SSE2_SumSubD xmm2, xmm4, xmm7 ; second butterfly stage
+ SSE2_SumSubD xmm1, xmm3, xmm7
+
+ SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
+
+ SSE2_SumSubD xmm4, xmm3, xmm7 ; butterfly stage after the transpose
+ SSE2_SumSubD xmm5, xmm1, xmm7
+
+ WELS_DD1 xmm6 ; defined outside this chunk; presumably dwords of 1, the rounding term below - TODO confirm
+ SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+ SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+ SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
+
+ packssdw xmm3, xmm4 ; dwords -> words with signed saturation
+ packssdw xmm2, xmm1
+ movdqa [r0+ 0], xmm3 ; 16 result words written to luma_dc (aligned stores)
+ movdqa [r0+16], xmm2
+
+ POP_XMM
+ ret
--- /dev/null
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -1,0 +1,1416 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred.asm
+;*
+;* Abstract
+;* sse2 function for intra predict operations
+;*
+;* History
+;* 18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 ; x-7 for columns 0..7 of the 16x16 plane predictor
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 ; rising weights; also x-7 for columns 8..15
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 ; falling weights for the mirrored neighbours
+
+; for chroma plane mode (these two are read 8 bytes at a time with movq)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 ; x-3 for columns 0..7 of the chroma plane predictor
+
+align 16
+mmx_01bytes: times 16 db 1 ; 0x01 in every byte: multiplying a byte value by it replicates that byte
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 ; 64-bit constant 2, the rounding term used by WelsIChromaPredDc_sse2
+
+
+;***********************************************************************
+; macros
+;***********************************************************************
+; Build sixteen bytes of 0x01 in a register without a memory load.
+; %1 receives the result; %2 is clobbered (left as all ones).
+%macro SSE_DB_1_2REG 2
+ pxor %1, %1 ; %1 = 0
+ pcmpeqw %2, %2 ; %2 = 0xff in every byte (-1)
+ psubb %1, %2 ; 0 - (-1) = 1 in every byte
+%endmacro
+
+;%1, %2: xmm work registers, %3: xmm scratch, %4: pointer to the first row, %5: stride
+;on exit the low dword of %1 repeats byte [%4-1] and the next dword repeats byte [%4+%5-1]
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+ movd %1, [%4-1] ; byte 0 is the left neighbour of the first row
+ movdqa %3, %1
+ punpcklbw %1, %3 ; each byte doubled
+ movdqa %3, %1
+ punpcklbw %1, %3 ; each byte repeated four times: low dword = left neighbour x4
+
+ ;add %4, %5
+ movd %2, [%4+%5-1] ; same for the second row, addressed without modifying %4
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2 ; low 64 bits = [row1 x4 | row0 x4]
+%endmacro
+
+%macro SUMW_HORIZON1 2 ; unsigned-saturating sum of the 8 words of %1 into word 0 of %1; %2 is scratch
+ movdqa %2, %1
+ psrldq %2, 8 ; fold the upper four words onto the lower four
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4 ; fold words 2..3 onto words 0..1
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2 ; fold word 1 onto word 0
+ paddusw %1, %2 ; word 0 = total; the other words hold partial sums
+%endmacro
+
+%macro LOAD_COLUMN 6 ; read 4 bytes from each of 8 rows (%5, step %6) and transpose; high 8 bytes of %1 = byte 3 of rows 0..7
+ movd %1, [%5] ; row 0
+ movd %2, [%5+%6] ; row 1
+ punpcklbw %1, %2 ; rows 0/1 interleaved bytewise
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 2
+ movd %2, [%5+%6] ; row 3
+ punpcklbw %3, %2
+ punpcklwd %1, %3 ; dword k of %1 = byte k of rows 0..3
+ lea %5, [%5+2*%6]
+ movd %4, [%5] ; row 4
+ movd %2, [%5+%6] ; row 5
+ punpcklbw %4, %2
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 6
+ movd %2, [%5+%6] ; row 7
+ lea %5, [%5+2*%6] ; %5 has now advanced by 8 rows
+ punpcklbw %3, %2
+ punpcklwd %4, %3 ; dword k of %4 = byte k of rows 4..7
+ punpckhdq %1, %4 ; %1 = [byte3 rows4-7 | byte3 rows0-3 | byte2 rows4-7 | byte2 rows0-3]; %2, %3, %4 clobbered
+%endmacro
+
+%macro SUMW_HORIZON 3 ; add up the 8 words of %1; %2 is scratch, %3 supplies the high halves when widening to dwords
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04 (clean dwords only when %3 is zero)
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567 (low 16 bits = sum of d0..d7 mod 2^16)
+%endmacro
+
+
+%macro COPY_16_TIMES 2 ; broadcast the byte at [%1-1] to all 16 bytes of %2
+ movdqa %2, [%1-16] ; aligned load: %1-16 must be 16-byte aligned
+ psrldq %2, 15 ; keep only the last byte, moved down to byte 0
+ pmuludq %2, [mmx_01bytes] ; byte * 0x01010101 fills the low dword with that byte
+ pshufd %2, %2, 0 ; copy the low dword to all four dwords
+%endmacro
+
+%macro COPY_16_TIMESS 3 ; broadcast the byte at [%1+%3-1] to all 16 bytes of %2
+ movdqa %2, [%1+%3-16] ; aligned load: %1+%3-16 must be 16-byte aligned
+ psrldq %2, 15 ; keep only the last byte, moved down to byte 0
+ pmuludq %2, [mmx_01bytes] ; byte * 0x01010101 fills the low dword with that byte
+ pshufd %2, %2, 0 ; copy the low dword to all four dwords
+%endmacro
+
+%macro LOAD_COLUMN_C 6 ; 4-row variant of LOAD_COLUMN; %4 is not referenced; %5 advances by 4 rows
+ movd %1, [%5] ; row 0
+ movd %2, [%5+%6] ; row 1
+ punpcklbw %1,%2 ; rows 0/1 interleaved bytewise
+ lea %5, [%5+2*%6]
+ movd %3, [%5] ; row 2
+ movd %2, [%5+%6] ; row 3
+ punpcklbw %3, %2
+ punpckhwd %1, %3 ; with the MMX registers the callers pass: high 4 bytes of %1 = byte 3 of rows 0..3
+ lea %5, [%5+2*%6]
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0 ; step r1 down two rows and add the two left-neighbour bytes to r3; clobbers r4
+ lea r1, [r1+2*r2] ; r2 is the stride
+ movzx r4, byte [r1-0x01] ; left neighbour of the row r1 now points at
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; left neighbour of the following row
+ add r3, r4
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; Horizontal 4x4 prediction: output row k is the byte pRef[k*stride-1] repeated four times.
+; pred must be 16-byte aligned (the 16 result bytes are written with one movdqa)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r3, byte [r1-1] ; left neighbour of row 0
+ movd xmm0, r3d
+ pmuludq xmm0, [mmx_01bytes] ; byte * 0x01010101 -> the byte repeated in the low dword
+
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 1
+ movd xmm1, r3d
+ pmuludq xmm1, [mmx_01bytes]
+
+ unpcklps xmm0, xmm1 ; low 64 bits = [row1 | row0]
+
+ lea r1, [r1+r2*2]
+ movzx r3, byte [r1-1] ; left neighbour of row 2
+ movd xmm2, r3d
+ pmuludq xmm2, [mmx_01bytes]
+
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 3
+ movd xmm3, r3d
+ pmuludq xmm3, [mmx_01bytes]
+
+ unpcklps xmm2, xmm3 ; low 64 bits = [row3 | row2]
+ unpcklpd xmm0, xmm2 ; xmm0 = [row3 | row2 | row1 | row0]
+
+ movdqa [r0], xmm0
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2 ; plane prediction: pred[16*y+x] = sat8((s + b*(x-7) + c*y) >> 5), s = a + 16 - 7*c
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2 ; r1 -> top-left neighbour (one row up, one column left of pRef)
+
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r1] ; 8 bytes starting at the top-left neighbour
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5 ; weighted 8..1
+ movq xmm1, [r1 + 9] ; 8 bytes of the top row starting at column 8
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6 ; weighted 1..8
+ psubw xmm1, xmm0
+
+ SUMW_HORIZON xmm1,xmm0,xmm7 ; xmm7 is still zero here; the former xmm2 operand was never initialised
+ movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r3, r3w ; only the low 16 bits of the sum are meaningful
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r4, BYTE [r1+16] ; top[15]
+ sub r1, 3 ; LOAD_COLUMN reads 4 bytes per row; byte 3 is the left neighbour
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 ; left column, rows -1..6; r1 advances 8 rows
+
+ add r1, 3
+ movzx r3, BYTE [r1+8*r2] ; left neighbour of row 15
+ add r4, r3
+ shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
+
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 ; left column, rows 8..15
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5 ; weighted 8..1
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6 ; weighted 1..8
+ psubw xmm7, xmm0
+
+ SUMW_HORIZON xmm7,xmm0,xmm4 ; xmm4 is zero here (overwritten only by the Copy8Times below)
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+
+ add r4, 16
+ imul r3, -7
+ add r3, r4 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r3, r3 ; row counter
+ movdqa xmm5, [sse2_plane_inc_minus]
+
+get_i16x16_luma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5 ; b * (x-7) for columns 0..7
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6 ; b * (x-7) for columns 8..15
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3 ; saturate to 0..255 and join the two halves
+ movdqa [r0], xmm2 ; aligned store: pred must be 16-byte aligned
+ paddw xmm0, xmm4 ; s += c for the next row
+ add r0, 16
+ inc r3
+ cmp r3, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_ONE_LINE 0 ; advance one row and store that row's left neighbour 16 times; uses r3, xmm0
+ add r0, 16 ; next 16-byte output row
+ add r1, r2 ; next source row (r1 points at the left-neighbour column)
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2 ; horizontal 16x16 prediction; pred is written with movdqa (16-byte aligned)
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ dec r1 ; r1 -> left neighbour of row 0
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0 ; row 0; the 15 macro calls below produce rows 1..15
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2 ; vertical 16x16 prediction: the row above pRef is copied into all 16 output rows
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> the 16 pixels directly above the block
+ movdqa xmm0, [r1] ; aligned load: the top row must be 16-byte aligned
+
+ movdqa [r0+0x00], xmm0 ; rows 0..15, 0x10 bytes apart (pred must be 16-byte aligned)
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
+
+ ret
+
+;***********************************************************************
+; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2 ; 8x8 plane prediction: pred[8*y+x] = sat8((s + b*(x-3) + c*y) >> 5), s = a + 16 - 3*c
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r2, r2d
+ sub r1, 1
+ sub r1, r2 ; r1 -> top-left neighbour (one row up, one column left of pRef)
+
+ pxor mm7, mm7
+ movq mm0, [r1] ; bytes starting at the top-left neighbour
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7 ; first four of them as words
+ pmullw mm0, mm5 ; weighted 4..1
+ movq mm1, [r1 + 5] ; top row starting at column 4
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6 ; weighted 1..4
+ psubw mm1, mm0
+
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2 ; zero operand for SUMW_HORIZON
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1 ; H
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r3, BYTE [r1+8] ; top[7]
+ sub r1, 3 ; LOAD_COLUMN_C reads 4 bytes per row; byte 3 is the left neighbour
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 ; left column, rows -1..2; r1 advances 4 rows
+
+ add r1, 3
+ movzx r4, BYTE [r1+4*r2] ; left neighbour of row 7
+ add r4, r3
+ shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
+
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 ; left column, rows 4..7
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5 ; weighted 4..1
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6 ; weighted 1..4
+ psubw mm7, mm0
+
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2 ; zero operand for SUMW_HORIZON
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+
+ add r4, 16
+ imul r3, -3
+ add r3, r4 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r3, r3 ; row counter
+ movdqa xmm5, [sse2_plane_mul_b_c]
+
+get_i_chroma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5 ; b * (x-3) for columns 0..7
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2 ; saturate to 0..255
+ movq [r0], xmm2 ; one 8-byte output row
+ paddw xmm0, xmm4 ; s += c for the next row
+ add r0, 8
+ inc r3
+ cmp r3, 8
+ jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pred[7] = ([6]+[0]*2+[1]+2)/4
+;
+; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+; (in the comments below mmN[k] is the k-th byte of mmN, counted from 1 at the low end)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movq mm1,[r1+r2-8] ;load the 8 bytes that end at pixel 11, so that mm1[8] = [11]
+ movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
+ sub r1, r2 ;move r1 to the row above the current block (position of 1)
+ punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[4]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r1,[r1+r2*2-8h] ;r1 = address of pixel 12 minus 8
+ movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r1+r2*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes, turning the rounded-up average into a floor
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+
+ movd [r0+12],mm2 ;row 3 of the block
+ psrlq mm2,8
+ movd [r0+8],mm2 ;row 2
+ psrlq mm2,8
+ movd [r0+4],mm2 ;row 1
+ psrlq mm2,8
+ movd [r0],mm2 ;row 0
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; 0 |1 |2 |3 |4 |
+; 5 |6 |7 |8 |9 |
+; 10|11|12|13|14|
+; 15|16|17|18|19|
+; 20|21|22|23|24|
+; 6 is the start pixel of current 4x4 block
+; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+; every one of the 16 output bytes gets this value; pred is written with movdqa (16-byte aligned)
+; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movzx r4, byte [r1-1h] ; left neighbour of row 0
+ sub r1, r2 ; r1 -> row above the block
+ movd xmm0, [r1] ; the four top neighbours
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1 ; sum of absolute differences against zero = sum of the bytes
+ xor r3, r3
+ movd r3d, xmm0
+ add r3, r4
+ movzx r4, byte [r1+r2*2-1h] ; left neighbour of row 1
+ add r3, r4
+
+ lea r1, [r1+r2*2-1] ; r1 -> left neighbour of row 1
+ movzx r4, byte [r1+r2] ; left neighbour of row 2
+ add r3, r4
+
+ movzx r4, byte [r1+r2*2] ; left neighbour of row 3
+ add r3, r4
+ add r3, 4 ; rounding
+ sar r3, 3 ; divide the 8-pixel sum by 8
+ imul r3, 0x01010101 ; replicate the byte across a dword
+
+ movd xmm0, r3d
+ pshufd xmm0, xmm0, 0 ; and across all 16 bytes
+ movdqa [r0], xmm0
+ pop r4
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; for each of 8 rows, store the row's left neighbour 8 times (8 bytes per output row)
+;***********************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4 ; %1: mm work register, %2: not referenced, %3: row pointer, %4: destination address
+ movq %1, [%3-8] ; 8 bytes ending at the left neighbour
+ psrlq %1, 38h ; keep only that last byte, in byte 0
+
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes] ; word 0 * 0x0101 -> the byte in both halves of word 0
+ pshufw %1, %1, 0 ; broadcast word 0 to all four words
+ movq [%4], %1
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4 ; same as above for the row at %3 + r2 (one stride further down)
+ movq %1, [%3+r2-8] ; 8 bytes ending at the left neighbour
+ psrlq %1, 38h ; keep only that last byte, in byte 0
+
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes] ; word 0 * 0x0101 -> the byte in both halves of word 0
+ pshufw %1, %1, 0 ; broadcast word 0 to all four words
+ movq [%4], %1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movq mm0, [r1-8] ; row 0, written out inline
+ psrlq mm0, 38h
+
+ ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
+ pmullw mm0, [mmx_01bytes]
+ pshufw mm0, mm0, 0
+ movq [r0], mm0
+
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 ; row 1
+
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16 ; row 2
+
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24 ; row 3
+
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32 ; row 4
+
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40 ; row 5
+
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48 ; row 6
+
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 ; row 7
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; copy the 4 pixels above the block into each of the 4 output rows (pred must be 16-byte aligned)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredV_sse2
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movd xmm0, [r1] ; four top neighbours in the low dword
+ pshufd xmm0, xmm0, 0 ; repeat that dword four times = four identical rows
+ movdqa [r0], xmm0
+ ret
+
+;***********************************************************************
+; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; copy the 8 pixels above the block into each of the 8 output rows (pred must be 16-byte aligned)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredV_sse2
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movq xmm0, [r1] ; eight top neighbours in the low qword
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm1 ; duplicate the qword: one register now holds two output rows
+ movdqa [r0], xmm0 ; rows 0-1
+ movdqa [r0+16], xmm0 ; rows 2-3
+ movdqa [r0+32], xmm0 ; rows 4-5
+ movdqa [r0+48], xmm0 ; rows 6-7
+ ret
+
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
+
+; a = (1 + lt + l0)>>1
+; e = (1 + l0 + l1)>>1
+; g = (1 + l1 + l2)>>1
+; i = (1 + l2 + l3)>>1
+
+; d = (2 + t0 + (t1<<1) + t2)>>2
+; c = (2 + lt + (t0<<1) + t1)>>2
+; b = (2 + l0 + (lt<<1) + t0)>>2
+
+; f = (2 + l1 + (l0<<1) + lt)>>2
+; h = (2 + l2 + (l1<<1) + l0)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
+; [b a f e h g j i] + [d c b a] --> mov to memory
+;
+; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movd mm2, [r1+2*r2-4]
+ punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0 ; rounded-up average of the two outer taps
+
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes, turning the average into a floor
+
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h g]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+
+ movd [r0], mm2 ; row 0 = a b c d
+ movd [r0+12], mm3 ; row 3 = i j g h
+ psrlq mm3, 10h
+ movd [r0+8], mm3 ; row 2 = g h e f
+ psrlq mm3, 10h
+ movd [r0+4], mm3 ; row 1 = e f a b
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; only l0..l3 contribute to the result; lt and t0..t3 are never used
+; destination:
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
+
+; a = (1 + l0 + l1)>>1
+; c = (1 + l1 + l2)>>1
+; e = (1 + l2 + l3)>>1
+; g = l3
+
+; b = (2 + l0 + (l1<<1) + l2)>>2
+; d = (2 + l1 + (l2<<1) + l3)>>2
+; f = (2 + l2 + (l3<<1) + l3)>>2
+
+; [g g f e d c b a] + [g g g g] --> mov to memory
+;
+; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ movd mm0, [r1-4] ; mm0[3] = l0
+ punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r1, [r1+2*r2]
+ movd mm2, [r1-4] ; mm2[3] = l2
+ movd mm4, [r1+r2-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0 ; rounded-up average of the two outer taps
+
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes, turning the average into a floor
+
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+
+ psrlq mm4, 20h
+ movd [r0+12], mm4 ; row 3 = g g g g
+
+ movd [r0], mm1 ; row 0 = a b c d
+ psrlq mm1, 10h
+ movd [r0+4], mm1 ; row 1 = c d e f
+ psrlq mm1, 10h
+ movd [r0+8], mm1 ; row 2 = e f g g
+ WELSEMMS
+ ret
+
+
+
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
+
+; a = (1 + lt + t0)>>1
+; b = (1 + t0 + t1)>>1
+; c = (1 + t1 + t2)>>1
+; d = (1 + t2 + t3)>>1
+
+; e = (2 + l0 + (lt<<1) + t0)>>2
+; f = (2 + lt + (t0<<1) + t1)>>2
+; g = (2 + t0 + (t1<<1) + t2)>>2
+
+; h = (2 + t1 + (t2<<1) + t3)>>2
+; i = (2 + lt + (l0<<1) + l1)>>2
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
+; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movq mm2, [r1+r2-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0 ; rounded-up average of the two outer taps
+
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes, turning the average into a floor
+
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
+
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1 ; row 0 = a b c d
+
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+4], mm2 ; row 1 = e f g h
+
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+8], mm4 ; row 2 = i a b c
+
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ movd [r0+12], mm5 ; row 3 = j e f g
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,l0,l1,l2,l3 are never used (only the top row t0..t7 is read)
+; destination:
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
+
+; a = (2 + t0 + t2 + (t1<<1))>>2
+; b = (2 + t1 + t3 + (t2<<1))>>2
+; c = (2 + t2 + t4 + (t3<<1))>>2
+; d = (2 + t3 + t5 + (t4<<1))>>2
+
+; e = (2 + t4 + t6 + (t5<<1))>>2
+; f = (2 + t5 + t7 + (t6<<1))>>2
+; g = (2 + t6 + t7 + (t7<<1))>>2
+
+; [g f e d c b a] --> mov to memory
+;
+; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
+
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+ movq mm3, mm1
+ pavgb mm1, mm2 ; rounded-up average of the two outer taps
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes, turning the average into a floor
+
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+
+ psrlq mm0, 8h
+ movd [r0], mm0 ; row 0 = a b c d
+ psrlq mm0, 8h
+ movd [r0+4], mm0 ; row 1 = b c d e
+ psrlq mm0, 8h
+ movd [r0+8], mm0 ; row 2 = c d e f
+ psrlq mm0, 8h
+ movd [r0+12], mm0 ; row 3 = d e f g
+ WELSEMMS
+ ret
+
+
+;***********************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt,l0,l1,l2,l3 are never used (only the top row is read)
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
+
+; a = (1 + t0 + t1)>>1
+; b = (1 + t1 + t2)>>1
+; c = (1 + t2 + t3)>>1
+; d = (1 + t3 + t4)>>1
+; i = (1 + t4 + t5)>>1
+
+; e = (2 + t0 + (t1<<1) + t2)>>2
+; f = (2 + t1 + (t2<<1) + t3)>>2
+; g = (2 + t2 + (t3<<1) + t4)>>2
+; h = (2 + t3 + (t4<<1) + t5)>>2
+; j = (2 + t4 + (t5<<1) + t6)>>2
+
+; [i d c b a] + [j h g f e] --> mov to memory
+;
+; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
+
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+
+ movq mm4, mm2
+ pavgb mm2, mm0 ; rounded-up average of the two outer taps
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes, turning the average into a floor
+
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+
+ movd [r0], mm3 ; row 0 = a b c d
+ psrlq mm3, 8h
+ movd [r0+8], mm3 ; row 2 = b c d i
+
+ movd [r0+4], mm2 ; row 1 = e f g h
+ psrlq mm2, 8h
+ movd [r0+12], mm2 ; row 3 = f g h j
+ WELSEMMS
+ ret
+
+;***********************************************************************
+; DC prediction for an 8x8 chroma block: one DC value per 4x4 quadrant, 8 bytes per output row
+; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; the eight top neighbours
+
+ movzx r3, byte [r1+r2-0x01] ; l1
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l2
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l3
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l4
+ add r3, r4
+ movd mm1, r3d ; mm1 = l1+l2+l3+l4
+
+ movzx r3, byte [r1+r2-0x01] ; l5
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l6
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l7
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l8
+ add r3, r4
+ movd mm2, r3d ; mm2 = l5+l6+l7+l8
+
+ movq mm3, mm0
+ psrlq mm0, 0x20 ; mm0 = top neighbours 4..7
+ psllq mm3, 0x20
+ psrlq mm3, 0x20 ; mm3 = top neighbours 0..3
+ pxor mm4, mm4
+ psadbw mm0, mm4 ; sum of the bytes (absolute difference against zero)
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+
+ movq mm4, [mmx_0x02]
+
+ paddq mm0, mm4
+ psrlq mm0, 0x02 ; (sum2 + 2) >> 2: four pixels
+
+ paddq mm2, mm4
+ psrlq mm2, 0x02 ; (sum3 + 2) >> 2: four pixels
+
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03 ; (sum1 + 4) >> 3: eight pixels
+
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03 ; (sum4 + 4) >> 3: eight pixels
+
+ pmuludq mm0, [mmx_01bytes] ; replicate each DC byte across a dword
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
+
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm1 = m_down
+
+ movq [r0], mm0 ; rows 0..3
+ movq [r0+0x08], mm0
+ movq [r0+0x10], mm0
+ movq [r0+0x18], mm0
+
+ movq [r0+0x20], mm1 ; rows 4..7
+ movq [r0+0x28], mm1
+ movq [r0+0x30], mm1
+ movq [r0+0x38], mm1
+
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
+
+
+
+;***********************************************************************
+; DC prediction: all 256 output bytes = (sum of 16 top + 16 left neighbours + 16) >> 5
+; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2 ; r1 -> row above the block
+ movdqa xmm0, [r1] ; read the 16 top neighbours (aligned load)
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1 ; two partial byte sums, one per 64-bit half
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08 ; clear the upper half
+ paddw xmm0, xmm1 ; low word = sum of all 16 top neighbours
+
+ movzx r3, byte [r1+r2-0x01] ; left neighbour of row 0
+ movzx r4, byte [r1+2*r2-0x01] ; left neighbour of row 1
+ add r3, r4
+ lea r1, [r1+r2] ; r1 -> row 0; each macro below steps two rows down
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r3, 0x10 ; rounding term
+ movd xmm1, r3d
+ paddw xmm0, xmm1
+ psrld xmm0, 0x05 ; divide the 32-pixel sum by 32
+ pmuludq xmm0, [mmx_01bytes] ; replicate the byte across the low dword
+ pshufd xmm0, xmm0, 0 ; and across all 16 bytes
+
+ movdqa [r0], xmm0 ; 16 aligned row stores: pred must be 16-byte aligned
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
+
+ pop r4
+ pop r3
+ ret
+
+;***********************************************************************
+; Scores V, H and DC 4x4 intra prediction in the Hadamard domain and emits the cheapest (x86-32 only).
+;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+; Returns the winning cost in eax; *pBestMode = 0 (V), 1 (H) or 2 (DC); 16 predicted bytes go to pRed (movdqa). The last three ints are added to the DC, H and V costs, in that order.
+;***********************************************************************
+%ifdef X86_32
+WELS_EXTERN WelsSampleSatdThree4x4_sse2
+ push ebx
+ push esi
+ push edi ; three pushes: the first argument now sits at [esp+16]
+ mov eax, [esp+24];p_enc
+ mov ebx, [esp+28];linesize_enc
+
+ ; load source 4x4 samples and Hadamard transform
+ movd xmm0, [eax]
+ movd xmm1, [eax+ebx]
+ lea eax , [eax+2*ebx]
+ movd xmm2, [eax]
+ movd xmm3, [eax+ebx]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ ; Hadamard transform results are saved in xmm0 and xmm2
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ ; load top boundary samples: [a b c d]
+ mov eax, [esp+16];p_dec
+ sub eax, [esp+20];linesize_dec
+ movzx ecx, byte [eax]
+ movzx edx, byte [eax+1]
+ movzx esi, byte [eax+2]
+ movzx edi, byte [eax+3]
+
+ ; get the transform results of top boundary samples: [a b c d]
+ add edx, ecx ; edx = a + b
+ add edi, esi ; edi = c + d
+ add ecx, ecx ; ecx = a + a
+ add esi, esi ; esi = c + c
+ sub ecx, edx ; ecx = a + a - a - b = a - b
+ sub esi, edi ; esi = c + c - c - d = c - d
+ add edi, edx ; edi = (a + b) + (c + d)
+ add edx, edx
+ sub edx, edi ; edx = (a + b) - (c + d)
+ add esi, ecx ; esi = (a - b) + (c - d)
+ add ecx, ecx
+ sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
+
+ movdqa xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
+ movdqa xmm7, xmm2
+ movd xmm5, edi ; store the edi for DC mode
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pinsrw xmm3, edi, 0
+ pinsrw xmm3, esi, 4
+ psllw xmm3, 2
+ pinsrw xmm4, edx, 0
+ pinsrw xmm4, ecx, 4
+ psllw xmm4, 2
+
+ ; get the satd of V (prediction from the top boundary)
+ psubw xmm0, xmm3
+ psubw xmm2, xmm4
+
+ WELS_AbsW xmm0, xmm1
+ WELS_AbsW xmm2, xmm1
+ paddusw xmm0, xmm2
+ SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
+
+ ; load left boundary samples: [a b c d]'
+ mov eax, [esp+16]
+ mov ebx, [esp+20]
+ movzx ecx, byte [eax-1]
+ movzx edx, byte [eax+ebx-1]
+ lea eax , [eax+2*ebx]
+ movzx esi, byte [eax-1]
+ movzx edi, byte [eax+ebx-1]
+
+ ; get the transform results of left boundary samples: [a b c d]'
+ add edx, ecx ; edx = a + b
+ add edi, esi ; edi = c + d
+ add ecx, ecx ; ecx = a + a
+ add esi, esi ; esi = c + c
+ sub ecx, edx ; ecx = a + a - a - b = a - b
+ sub esi, edi ; esi = c + c - c - d = c - d
+ add edi, edx ; edi = (a + b) + (c + d)
+ add edx, edx
+ sub edx, edi ; edx = (a + b) - (c + d)
+ add esi, ecx ; esi = (a - b) + (c - d)
+ add ecx, ecx
+ sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
+
+ ; store the transform results in xmm3
+ movd xmm3, edi
+ pinsrw xmm3, edx, 1
+ pinsrw xmm3, ecx, 2
+ pinsrw xmm3, esi, 3
+ psllw xmm3, 2
+
+ ; get the satd of H (prediction from the left boundary)
+ movdqa xmm2, xmm6
+ movdqa xmm4, xmm7
+ psubw xmm2, xmm3
+ WELS_AbsW xmm2, xmm1
+ WELS_AbsW xmm4, xmm1
+ paddusw xmm2, xmm4
+ SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
+
+ ; DC result is stored in xmm1
+ add edi, 4 ; edi = sum of the left samples; 4 is the rounding term
+ movd xmm1, edi
+ paddw xmm1, xmm5 ; plus the sum of the top samples saved earlier
+ psrlw xmm1, 3
+ movdqa xmm5, xmm1 ; keep the DC pixel value for the DC output path
+ psllw xmm1, 4 ; DC * 16, subtracted from word 0 of the source transform
+
+ ; get the satd of DC
+ psubw xmm6, xmm1
+ WELS_AbsW xmm6, xmm1
+ WELS_AbsW xmm7, xmm1
+ paddusw xmm6, xmm7
+ SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
+
+ ; comparing order: DC H V
+ mov edx, [esp+32] ; edx = pRed for all three output paths
+ movd eax, xmm6
+ movd edi, xmm2
+ movd esi, xmm0
+ and eax, 0xffff
+ shr eax, 1
+ and edi, 0xffff
+ shr edi, 1
+ and esi, 0xffff
+ shr esi, 1
+ add eax, [esp+40] ; eax = DC cost
+ add edi, [esp+44] ; edi = H cost
+ add esi, [esp+48] ; esi = V cost
+ cmp ax, di ; note: signed 16-bit compares
+ jg near not_dc
+ cmp ax, si
+ jg near not_dc_h
+
+ ; for DC mode
+ movd ebx, xmm5
+ imul ebx, 0x01010101
+ movd xmm5, ebx
+ pshufd xmm5, xmm5, 0
+ movdqa [edx], xmm5
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x02
+ pop edi
+ pop esi
+ pop ebx
+ ret ; eax still holds the DC cost
+
+not_dc:
+ cmp di, si
+ jg near not_dc_h
+
+ ; for H mode
+ SSE_DB_1_2REG xmm6, xmm7
+ mov eax, [esp+16]
+ mov ebx, [esp+20]
+ movzx ecx, byte [eax-1]
+ movd xmm0, ecx
+ pmuludq xmm0, xmm6
+
+ movzx ecx, byte [eax+ebx-1]
+ movd xmm1, ecx
+ pmuludq xmm1, xmm6
+%if 1
+ punpckldq xmm0, xmm1
+%else
+ unpcklps xmm0, xmm1
+%endif
+ lea eax, [eax+ebx*2]
+ movzx ecx, byte [eax-1]
+ movd xmm2, ecx
+ pmuludq xmm2, xmm6
+
+ movzx ecx, byte [eax+ebx-1]
+ movd xmm3, ecx
+ pmuludq xmm3, xmm6
+%if 1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+%else
+ unpcklps xmm2, xmm3
+ unpcklpd xmm0, xmm2
+%endif
+ movdqa [edx],xmm0
+
+ mov eax, edi ; return the H cost
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x01
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+not_dc_h:
+ ; for V mode
+ mov eax, [esp+16]
+ sub eax, [esp+20]
+ movd xmm0, [eax]
+ pshufd xmm0, xmm0, 0
+ movdqa [edx],xmm0
+
+ mov eax, esi ; return the V cost
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x00
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+
--- /dev/null
+++ b/codec/encoder/core/x86/memzero.asm
@@ -1,0 +1,132 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* memzero.asm
+;*
+;* Abstract
+;*
+;*
+;* History
+;* 9/16/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+;void WelsPrefetchZero_mmx(int8_t const*_A); issues a cache prefetch hint for the address _A
+;***********************************************************************
+WELS_EXTERN WelsPrefetchZero_mmx
+ %assign push_num 0
+ LOAD_1_PARA ; presumably places the argument _A in r0 (macro from asm_inc.asm)
+ prefetchnta [r0] ; non-temporal prefetch hint only; despite the name, nothing is written or zeroed
+ ret
+
+
+;***********************************************************************
+; void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroAligned64_sse2
+; Zero-fills size bytes at dst, 64 bytes per loop pass, using aligned 16-byte SSE2 stores.
+ %assign push_num 0
+ LOAD_2_PARA ; presumably places dst in r0 and size in r1 (macro from asm_inc.asm)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit size to the full register width
+ neg r1 ; r1 = -size, counted up towards zero by the loop
+
+ pxor xmm0, xmm0 ; xmm0 = all zero, the fill pattern
+.memzeroa64_sse2_loops:
+ movdqa [r0], xmm0 ; movdqa stores require dst to be 16-byte aligned
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ add r0, 0x40 ; advance dst by the 64 bytes just cleared
+
+ add r1, 0x40 ; 64 fewer bytes remaining
+ jnz near .memzeroa64_sse2_loops ; stops only when the counter is exactly 0: size must be a non-zero multiple of 64
+
+ ret
+
+;***********************************************************************
+; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize64_mmx
+; Zero-fills size bytes at dst, 64 bytes per loop pass, using eight 8-byte MMX stores.
+ %assign push_num 0
+ LOAD_2_PARA ; presumably places dst in r0 and size in r1 (macro from asm_inc.asm)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit size to the full register width
+ neg r1 ; r1 = -size, counted up towards zero by the loop
+
+ pxor mm0, mm0 ; mm0 = all zero, the fill pattern
+.memzero64_mmx_loops:
+ movq [r0], mm0 ; eight 8-byte stores clear one 64-byte chunk
+ movq [r0+8], mm0
+ movq [r0+16], mm0
+ movq [r0+24], mm0
+ movq [r0+32], mm0
+ movq [r0+40], mm0
+ movq [r0+48], mm0
+ movq [r0+56], mm0
+ add r0, 0x40 ; advance dst by the 64 bytes just cleared
+
+ add r1, 0x40 ; 64 fewer bytes remaining
+ jnz near .memzero64_mmx_loops ; stops only when the counter is exactly 0: size must be a non-zero multiple of 64
+
+ WELSEMMS ; presumably emits emms so x87 code can run afterwards (macro from asm_inc.asm)
+ ret
+
+;***********************************************************************
+; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize8_mmx
+; Zero-fills size bytes at dst, 8 bytes per loop pass, using one MMX store per pass.
+ %assign push_num 0
+ LOAD_2_PARA ; presumably places dst in r0 and size in r1 (macro from asm_inc.asm)
+ SIGN_EXTENSION r1, r1d ; presumably widens the 32-bit size to the full register width
+ neg r1 ; r1 = -size, counted up towards zero by the loop
+ pxor mm0, mm0 ; mm0 = all zero, the fill pattern
+
+.memzero8_mmx_loops:
+ movq [r0], mm0 ; clear 8 bytes
+ add r0, 0x08
+
+ add r1, 0x08 ; 8 fewer bytes remaining
+ jnz near .memzero8_mmx_loops ; stops only when the counter is exactly 0: size must be a non-zero multiple of 8
+
+ WELSEMMS ; presumably emits emms so x87 code can run afterwards (macro from asm_inc.asm)
+ ret
+
+
--- /dev/null
+++ b/codec/encoder/core/x86/quant.asm
@@ -1,0 +1,370 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* quant.asm
+;*
+;* Abstract
+;* sse2 quantize inter-block
+;*
+;* History
+;* 7/6/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+SECTION .text
+;************************************************
+;NEW_QUANT
+;************************************************
+
+%macro SSE2_Quant8 5
+ MOVDQ %1, %5 ; %1 = eight int16 coefficients fetched from operand %5 (MOVDQ is a move macro from asm_inc.asm)
+ pxor %2, %2
+ pcmpgtw %2, %1 ; %2 = sign mask: 0xFFFF in every lane whose coefficient is negative
+ pxor %1, %2
+ psubw %1, %2 ; (x ^ mask) - mask, i.e. %1 = |x|
+ paddusw %1, %3 ; |x| + %3 (the ff vector at the call sites), unsigned saturating
+ pmulhuw %1, %4 ; keep the high 16 bits of the unsigned product with %4 (the mf vector at the call sites)
+ pxor %1, %2
+ psubw %1, %2 ; re-apply the original sign
+ MOVDQ %5, %1 ; write the quantized lanes back over the input at %5
+%endmacro
+
+%macro SSE2_QuantMax8 6
+ MOVDQ %1, %5 ; %1 = eight int16 coefficients fetched from operand %5 (MOVDQ is a move macro from asm_inc.asm)
+ pxor %2, %2
+ pcmpgtw %2, %1 ; %2 = sign mask: 0xFFFF in every lane whose coefficient is negative
+ pxor %1, %2
+ psubw %1, %2 ; (x ^ mask) - mask, i.e. %1 = |x|
+ paddusw %1, %3 ; |x| + %3 (ff), unsigned saturating
+ pmulhuw %1, %4 ; keep the high 16 bits of the unsigned product with %4 (mf)
+ pmaxsw %6, %1 ; %6 = running per-lane maximum of the quantized magnitudes (taken before the sign is restored)
+ pxor %1, %2
+ psubw %1, %2 ; re-apply the original sign
+ MOVDQ %5, %1 ; write the quantized lanes back over the input at %5
+%endmacro
+
+%define pDct esp + 4
+%define ff esp + 8
+%define mf esp + 12
+%define max esp + 16
+;***********************************************************************
+; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4_sse2
+ %assign push_num 0
+ LOAD_3_PARA ; presumably loads pDct/ff/mf into r0/r1/r2; the esp-relative %defines above are not referenced by any code in this file
+ movdqa xmm2, [r1] ; xmm2 = ff[0..7]; aligned load, ff must be 16-byte aligned
+ movdqa xmm3, [r2] ; xmm3 = mf[0..7]; aligned load, mf must be 16-byte aligned
+
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] ; quantize coefficients 0..7 in place
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] ; quantize coefficients 8..15 with the same eight ff/mf lanes
+
+ ret
+
+;***********************************************************************
+;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4Dc_sse2
+ %assign push_num 0
+ LOAD_3_PARA ; presumably loads pDct/ff/mf into r0/r1/r2 (macro from asm_inc.asm)
+ SIGN_EXTENSIONW r1, r1w ; presumably sign-extends the 16-bit scalar ff
+ SIGN_EXTENSIONW r2, r2w ; presumably sign-extends the 16-bit scalar mf
+ SSE2_Copy8Times xmm3, r2d ; presumably broadcasts mf to all eight word lanes of xmm3
+
+ SSE2_Copy8Times xmm2, r1d ; presumably broadcasts ff to all eight word lanes of xmm2
+
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] ; quantize coefficients 0..7 in place
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10] ; quantize coefficients 8..15 in place
+
+ ret
+
+;***********************************************************************
+; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4_sse2
+ %assign push_num 0
+ LOAD_3_PARA ; presumably loads pDct/ff/mf into r0/r1/r2 (macro from asm_inc.asm)
+ MOVDQ xmm2, [r1] ; xmm2 = ff[0..7]
+ MOVDQ xmm3, [r2] ; xmm3 = mf[0..7]
+
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0] ; eight steps of 16 bytes: 64 coefficients (128 bytes) quantized in place
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70] ; the same eight ff/mf lanes are reused for every step
+
+ ret
+
+;***********************************************************************
+; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* ff, int16_t *mf, int16_t *max);
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4Max_sse2
+ %assign push_num 0
+ LOAD_4_PARA ; presumably loads pDct/ff/mf/max into r0..r3 (macro from asm_inc.asm)
+ PUSH_XMM 8 ; presumably preserves xmm registers where the ABI requires it; paired with POP_XMM below
+ MOVDQ xmm2, [r1] ; xmm2 = ff[0..7], consumed as eight 16-bit lanes
+ MOVDQ xmm3, [r2] ; xmm3 = mf[0..7]
+
+ pxor xmm4, xmm4 ; xmm4..xmm7 = running maxima, one register per group of 16 coefficients
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4 ; coefficients 0..15 feed xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5 ; coefficients 16..31 feed xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6 ; coefficients 32..47 feed xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7 ; coefficients 48..63 feed xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+
+ SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0 ; presumably a word transpose (asm_inc.asm) whose outputs are the four registers combined below - TODO confirm
+ pmaxsw xmm0, xmm4 ; lane-wise maximum across the transposed rows
+ pmaxsw xmm0, xmm5
+ pmaxsw xmm0, xmm7
+ movdqa xmm1, xmm0
+ punpckhqdq xmm0, xmm1 ; low half of xmm0 = former high half
+ pmaxsw xmm0, xmm1 ; low four words = max(high half, low half)
+
+ movq [r3], xmm0 ; store four int16 maxima to max[0..3], presumably one per 4x4 block
+ POP_XMM
+ LOAD_4_PARA_POP ; presumably undoes whatever LOAD_4_PARA saved
+ ret
+
+%macro MMX_Copy4Times 2
+ movd %1, %2 ; low 32 bits of %1 = %2
+ punpcklwd %1, %1 ; word 0 duplicated into words 0 and 1
+ punpckldq %1, %1 ; that pair duplicated again: all four word lanes of %1 = low 16 bits of %2
+%endmacro
+
+SECTION .text
+
+%macro MMX_Quant4 4
+ pxor %2, %2
+ pcmpgtw %2, %1 ; %2 = sign mask: 0xFFFF in every lane where %1 is negative
+ pxor %1, %2
+ psubw %1, %2 ; (x ^ mask) - mask, i.e. %1 = |x|
+ paddusw %1, %3 ; |x| + %3 (ff at the call sites), unsigned saturating
+ pmulhuw %1, %4 ; keep the high 16 bits of the unsigned product with %4 (mf at the call sites)
+ pxor %1, %2
+ psubw %1, %2 ; re-apply the original sign; the result stays in register %1, %2 is clobbered
+%endmacro
+
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2_mmx
+ %assign push_num 0
+ LOAD_5_PARA ; presumably loads the five arguments into r0..r4 (macro from asm_inc.asm)
+ SIGN_EXTENSIONW r1, r1w ; presumably sign-extends the 16-bit scalar ff
+ SIGN_EXTENSIONW r2, r2w ; presumably sign-extends the 16-bit scalar mf
+ movd mm0, [r0] ; word 0 = rs[0]
+ movd mm1, [r0 + 0x20] ; 0x20 bytes = 16 int16 further on: word 0 = rs[16]
+ punpcklwd mm0, mm1 ; low two words of mm0 = rs[0], rs[16]
+ movd mm3, [r0 + 0x40] ; word 0 = rs[32]
+ movd mm1, [r0 + 0x60] ; word 0 = rs[48]
+ punpcklwd mm3, mm1 ; low two words of mm3 = rs[32], rs[48]
+
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0 ; sums s0 = dct0+dct2, s1 = dct1+dct3
+ psubw mm0, mm5 ; differences d0 = dct0-dct2, d1 = dct1-dct3
+ punpcklwd mm3, mm0 ; mm3 words = s0, d0, s1, d1
+ movq mm1, mm3
+ psrlq mm1, 32 ; mm1 words = s1, d1, 0, 0
+ movq mm5, mm1
+ paddw mm1, mm3 ; low words = s0+s1, d0+d1
+ psubw mm3, mm5 ; low words = s0-s1, d0-d1
+ punpcklwd mm1, mm3 ; mm1 words = s0+s1, s0-s1, d0+d1, d0-d1 (the 2x2 transform output)
+
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d ; mm3 = mf in all four word lanes
+ MMX_Copy4Times mm2, r1d ; mm2 = ff in all four word lanes
+ MMX_Quant4 mm1, mm0, mm2, mm3 ; mm1 = quantized values; mm0 is scratch
+
+ ; store dct_2x2
+ movq [r3], mm1 ; four int16 results written to pDct
+ movq [r4], mm1 ; and the same four written to block
+
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3 ; results narrowed to bytes 0..3 (non-zero stays non-zero), bytes 4..7 = 0
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ; sum of the eight bytes = number of non-zero results
+ mov r1w, 0
+ mov [r0], r1w ; clear the four source coefficients rs[0], rs[16], rs[32], rs[48]
+ mov [r0 + 0x20], r1w
+ mov [r0 + 0x40], r1w
+ mov [r0 + 0x60], r1w
+
+
+ movd retrd, mm1 ; return value = non-zero count
+
+ WELSEMMS ; presumably emits emms so x87 code can run afterwards (macro from asm_inc.asm)
+ LOAD_5_PARA_POP ; presumably undoes whatever LOAD_5_PARA saved
+ ret
+
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
+ %assign push_num 0
+ LOAD_3_PARA ; presumably loads pDct/ff/mf into r0/r1/r2 (macro from asm_inc.asm)
+ SIGN_EXTENSIONW r1, r1w ; presumably sign-extends the 16-bit scalar ff
+ SIGN_EXTENSIONW r2, r2w ; presumably sign-extends the 16-bit scalar mf
+ movd mm0, [r0] ; word 0 = pDct[0]
+ movd mm1, [r0 + 0x20] ; 0x20 bytes = 16 int16 further on: word 0 = pDct[16]
+ punpcklwd mm0, mm1 ; low two words of mm0 = pDct[0], pDct[16]
+ movd mm3, [r0 + 0x40] ; word 0 = pDct[32]
+ movd mm1, [r0 + 0x60] ; word 0 = pDct[48]
+ punpcklwd mm3, mm1 ; low two words of mm3 = pDct[32], pDct[48]
+
+ ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
+ movq mm5, mm3
+ paddw mm3, mm0 ; sums s0 = dct0+dct2, s1 = dct1+dct3
+ psubw mm0, mm5 ; differences d0 = dct0-dct2, d1 = dct1-dct3
+ punpcklwd mm3, mm0 ; mm3 words = s0, d0, s1, d1
+ movq mm1, mm3
+ psrlq mm1, 32 ; mm1 words = s1, d1, 0, 0
+ movq mm5, mm1
+ paddw mm1, mm3 ; low words = s0+s1, d0+d1
+ psubw mm3, mm5 ; low words = s0-s1, d0-d1
+ punpcklwd mm1, mm3 ; mm1 words = s0+s1, s0-s1, d0+d1, d0-d1 (the 2x2 transform output)
+
+ ;quant_2x2_dc
+ MMX_Copy4Times mm3, r2d ; mm3 = mf in all four word lanes
+ MMX_Copy4Times mm2, r1d ; mm2 = ff in all four word lanes
+ MMX_Quant4 mm1, mm0, mm2, mm3 ; mm1 = quantized values; mm0 is scratch
+
+ ; pNonZeroCount of dct_2x2
+ pcmpeqb mm2, mm2 ; mm2 = FF
+ pxor mm3, mm3
+ packsswb mm1, mm3 ; results narrowed to bytes 0..3 (non-zero stays non-zero), bytes 4..7 = 0
+ pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
+ psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
+ psadbw mm1, mm3 ; sum of the eight bytes = number of non-zero results
+ movd retrd, mm1 ; return the count; this variant performs no stores, pDct is only read
+
+ WELSEMMS ; presumably emits emms so x87 code can run afterwards (macro from asm_inc.asm)
+ ret
+
+
+%macro SSE2_DeQuant8 3
+ MOVDQ %2, %1 ; %2 = eight int16 coefficients fetched from operand %1 (MOVDQ is a move macro from asm_inc.asm)
+ pmullw %2, %3 ; lane-wise multiply by %3, keeping the low 16 bits of each product
+ MOVDQ %1, %2 ; write the scaled lanes back over the input at %1
+%endmacro
+
+
+;***********************************************************************
+; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;***********************************************************************
+WELS_EXTERN WelsDequant4x4_sse2
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads pDct into r0 and mf into r1 (macro from asm_inc.asm)
+
+ movdqa xmm1, [r1] ; xmm1 = mf[0..7]; aligned load, mf must be 16-byte aligned
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1 ; coefficients 0..7 multiplied in place
+ SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1 ; coefficients 8..15 multiplied by the same eight factors
+
+ ret
+
+;***********************************************************************====
+;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;***********************************************************************====
+; Multiplies 64 coefficients (128 bytes at pDct) in place by the eight factors at mf.
+WELS_EXTERN WelsDequantFour4x4_sse2
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads pDct into r0 and mf into r1 (macro from asm_inc.asm)
+
+ movdqa xmm1, [r1] ; xmm1 = mf[0..7]; aligned load, mf must be 16-byte aligned
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1 ; eight steps of 16 bytes each
+ SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1 ; the same eight factors are reused for every step
+
+ ret
+
+;***********************************************************************
+;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsDequantIHadamard4x4_sse2
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads rs into r0 and mf into r1 (macro from asm_inc.asm)
+ %ifndef X86_32
+ movzx r1, r1w ; mf is unsigned 16-bit: zero-extend it on non-X86_32 builds
+ %endif
+
+ ; WelsDequantLumaDc4x4
+ SSE2_Copy8Times xmm1, r1d ; presumably broadcasts mf to all eight word lanes of xmm1
+ ;psrlw xmm1, 2 ; for the (>>2) in ihdm
+ MOVDQ xmm0, [r0] ; xmm0 = rs[0..7]
+ MOVDQ xmm2, [r0+0x10] ; xmm2 = rs[8..15]
+ pmullw xmm0, xmm1 ; scale all 16 values by mf (low 16 bits of each product)
+ pmullw xmm2, xmm1
+
+ ; ihdm_4x4
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8 ; low half of xmm1 = rs[4..7]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8 ; low half of xmm3 = rs[12..15]
+
+ SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+ SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+ SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+ SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+
+ SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4 ; presumably a word transpose between the two butterfly passes (asm_inc.asm) - TODO confirm output registers
+ SSE2_SumSub xmm2, xmm4, xmm5 ; second butterfly pass, same sum/difference pattern as above
+ SSE2_SumSub xmm1, xmm0, xmm5
+ SSE2_SumSub xmm4, xmm0, xmm5
+ SSE2_SumSub xmm2, xmm1, xmm5
+ SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3 ; presumably transposes back to the original orientation - TODO confirm
+
+ punpcklqdq xmm0, xmm1 ; pack two 4-word rows into one register
+ MOVDQ [r0], xmm0 ; overwrite rs[0..7] with the result
+
+ punpcklqdq xmm2, xmm3 ; pack the remaining two rows
+ MOVDQ [r0+16], xmm2 ; overwrite rs[8..15] with the result
+ ret
--- /dev/null
+++ b/codec/encoder/core/x86/score.asm
@@ -1,0 +1,339 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* score.asm
+;*
+;* Abstract
+;* scan/score/count of sse2
+;*
+;* History
+;* 8/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+SECTION .rodata align=16
+
+;align 16
+;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
+align 16
+sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1 ; eight words of 1; no code in this file references it
+align 16
+sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ; sixteen bytes of 1; no code in this file references it
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; 16-entry byte table indexed by WelsCalculateSingleCtr4x4_sse2 with a value in 0..15
+align 16
+sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0 ; no code in this file references it
+align 16
+sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8 ; no code in this file references it
+align 16
+sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1 ; no code in this file references it
+align 16
+pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13 ; pshufb byte indices for the low 8 words in WelsScan4x4DcAc_ssse3
+align 16
+pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15 ; pshufb byte indices for the high 8 words in WelsScan4x4DcAc_ssse3
+align 16
+pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1 ; pshufb-style index mask; no code in this file references it
+align 16
+pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128 ; as a pshufb mask, 128 (bit 7 set) yields a zero byte; no code in this file references it
+
+align 16
+nozero_count_table: ; 256 bytes: entry i = number of set bits in i; WelsGetNoneZeroCount_sse2 indexes it with each byte of its 16-bit non-zero mask
+db 0,1,1,2,1,2,2,3,1,2
+db 2,3,2,3,3,4,1,2,2,3
+db 2,3,3,4,2,3,3,4,3,4
+db 4,5,1,2,2,3,2,3,3,4
+db 2,3,3,4,3,4,4,5,2,3
+db 3,4,3,4,4,5,3,4,4,5
+db 4,5,5,6,1,2,2,3,2,3
+db 3,4,2,3,3,4,3,4,4,5
+db 2,3,3,4,3,4,4,5,3,4
+db 4,5,4,5,5,6,2,3,3,4
+db 3,4,4,5,3,4,4,5,4,5
+db 5,6,3,4,4,5,4,5,5,6
+db 4,5,5,6,5,6,6,7,1,2
+db 2,3,2,3,3,4,2,3,3,4
+db 3,4,4,5,2,3,3,4,3,4
+db 4,5,3,4,4,5,4,5,5,6
+db 2,3,3,4,3,4,4,5,3,4
+db 4,5,4,5,5,6,3,4,4,5
+db 4,5,5,6,4,5,5,6,5,6
+db 6,7,2,3,3,4,3,4,4,5
+db 3,4,4,5,4,5,5,6,3,4
+db 4,5,4,5,5,6,4,5,5,6
+db 5,6,6,7,3,4,4,5,4,5
+db 5,6,4,5,5,6,5,6,6,7
+db 4,5,5,6,5,6,6,7,5,6
+db 6,7,6,7,7,8
+
+align 16
+high_mask_table: ; 256 bytes indexed by bits 8..15 of the non-zero mask in WelsCalculateSingleCtr4x4_sse2; presumably precomputed score contributions - TODO confirm derivation
+ db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+ db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+ db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+ db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
+ db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
+ db 5, 8, 5, 7, 8,11, 6, 8, 8,11
+ db 9,11,12,15, 0, 1, 1, 4, 1, 3
+ db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
+ db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
+ db 7,10, 8,10,11,14, 3, 4, 4, 7
+ db 5, 7, 8,11, 5, 7, 7,10, 8,10
+ db 11,14, 6, 7, 8,11, 8,10,11,14
+ db 9,11,11,14,12,14,15,18, 0, 0
+ db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+ db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
+ db 7,10, 5, 7, 7,10, 8,10,11,14
+ db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
+ db 6, 9, 7, 9,10,13, 5, 6, 7,10
+ db 7, 9,10,13, 8,10,10,13,11,13
+ db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+ db 5, 7, 7,10, 8,10,11,14, 5, 6
+ db 7,10, 7, 9,10,13, 8,10,10,13
+ db 11,13,14,17, 6, 7, 7,10, 8,10
+ db 11,14, 8,10,10,13,11,13,14,17
+ db 9,10,11,14,11,13,14,17,12,14
+ db 14,17,15,17,18,21
+
+align 16
+low_mask_table: ; 256 bytes indexed by bits 0..7 of the non-zero mask in WelsCalculateSingleCtr4x4_sse2; presumably precomputed score contributions - TODO confirm derivation
+ db 0, 3, 2, 6, 2, 5, 5, 9, 1, 5
+ db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
+ db 4, 7, 7,11, 4, 8, 7,11, 8,11
+ db 11,15, 1, 4, 3, 7, 4, 7, 7,11
+ db 3, 7, 6,10, 7,10,10,14, 4, 7
+ db 7,11, 7,10,10,14, 7,11,10,14
+ db 11,14,14,18, 0, 4, 3, 7, 3, 6
+ db 6,10, 3, 7, 6,10, 7,10,10,14
+ db 3, 6, 6,10, 6, 9, 9,13, 6,10
+ db 9,13,10,13,13,17, 4, 7, 6,10
+ db 7,10,10,14, 6,10, 9,13,10,13
+ db 13,17, 7,10,10,14,10,13,13,17
+ db 10,14,13,17,14,17,17,21, 0, 3
+ db 3, 7, 3, 6, 6,10, 2, 6, 5, 9
+ db 6, 9, 9,13, 3, 6, 6,10, 6, 9
+ db 9,13, 6,10, 9,13,10,13,13,17
+ db 3, 6, 5, 9, 6, 9, 9,13, 5, 9
+ db 8,12, 9,12,12,16, 6, 9, 9,13
+ db 9,12,12,16, 9,13,12,16,13,16
+ db 16,20, 3, 7, 6,10, 6, 9, 9,13
+ db 6,10, 9,13,10,13,13,17, 6, 9
+ db 9,13, 9,12,12,16, 9,13,12,16
+ db 13,16,16,20, 7,10, 9,13,10,13
+ db 13,17, 9,13,12,16,13,16,16,20
+ db 10,13,13,17,13,16,16,20,13,17
+ db 16,20,17,20,20,24
+
+
+SECTION .text
+
+;***********************************************************************
+;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4DcAc_sse2
+ %ifdef X86_32
+ push r3 ; r3 is used as scratch below, so it is saved on X86_32 builds
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_2_PARA ; presumably loads level into r0 and pDct into r1 (macro from asm_inc.asm)
+ movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
+ movdqa xmm1, [r1+16] ; f e d c b a 9 8
+ pextrw r2d, xmm0, 7 ; r2d = 7
+ pextrw r3d, xmm1, 2 ; r3d = a
+ pextrw r1d, xmm0, 5 ; r1d = 5 (the pDct pointer is no longer needed)
+ pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
+ pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
+ pextrw r2d, xmm1, 0 ; r2d = 8
+ pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
+ pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
+ pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
+ pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
+ pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
+ pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
+ movdqa [r0],xmm0 ; level[0..7] = pDct indices 0 1 4 8 5 2 3 6
+ movdqa [r0+16], xmm1 ; level[8..15] = pDct indices 9 c d a 7 b e f
+ %ifdef X86_32
+ pop r3
+ %endif
+ ret
+
+;***********************************************************************
+;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4DcAc_ssse3
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads level into r0 and pDct into r1 (macro from asm_inc.asm)
+ movdqa xmm0, [r1] ; xmm0 = pDct[0..7]
+ movdqa xmm1, [r1+16] ; xmm1 = pDct[8..15]
+ pextrw r2d, xmm0, 7 ; r2d = [7]
+ pextrw r1d, xmm1, 0 ; r1d = [8] (the pDct pointer is no longer needed)
+ pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
+ pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
+ pshufb xmm1, [pb_scanacdc_maskb] ; after the 7/8 swap each half is a pure in-register byte permutation
+ pshufb xmm0, [pb_scanacdc_maska]
+
+ movdqa [r0],xmm0 ; level[0..7] = pDct indices 0 1 4 8 5 2 3 6
+ movdqa [r0+16], xmm1 ; level[8..15] = pDct indices 9 c d a 7 b e f
+ ret
+;***********************************************************************
+;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4Ac_sse2
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads zig_value into r0 and pDct into r1 (macro from asm_inc.asm)
+ movdqa xmm0, [r1] ; xmm0 = pDct[0..7]
+ movdqa xmm1, [r1+16] ; xmm1 = pDct[8..15]
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm1 ; xmm0 = low halves of both inputs
+ punpckhqdq xmm2, xmm1 ; xmm2 = high halves of both inputs
+
+ movdqa xmm3, xmm0
+ punpckldq xmm0, xmm2 ; interleave dword pairs of the two registers (low part)
+ punpckhdq xmm3, xmm2 ; interleave dword pairs of the two registers (high part)
+ pextrw r1d , xmm0, 3 ; from here on words are swapped between xmm0 and xmm3 through r1d/r2d
+ pextrw r2d , xmm0, 7
+ pinsrw xmm0, r1d, 7
+ pextrw r1d, xmm3, 4
+ pinsrw xmm3, r2d, 4
+ pextrw r2d, xmm3, 0
+ pinsrw xmm3, r1d, 0
+ pinsrw xmm0, r2d, 3
+
+ pshufhw xmm1, xmm0, 0x93 ; final in-register word shuffles of each half
+ pshuflw xmm2, xmm3, 0x39
+
+ movdqa xmm3, xmm2
+ psrldq xmm1, 2 ; drop the first word of the scanned sequence
+ pslldq xmm3, 14 ; move the first word of the second half up to word 7
+ por xmm1, xmm3 ; so the whole 16-word sequence shifts down by one word
+ psrldq xmm2, 2 ; last output word becomes 0
+ movdqa [r0],xmm1 ; zig_value[0..7]
+ movdqa [r0+16], xmm2 ; zig_value[8..15], final word zero
+ ret
+
+
+;***********************************************************************
+;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
+;***********************************************************************
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
+ %ifdef X86_32
+ push r3 ; r3 holds the non-zero mask below, so it is saved on X86_32 builds
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_1_PARA ; presumably loads pDct into r0 (macro from asm_inc.asm)
+ movdqa xmm0, [r0] ; xmm0 = pDct[0..7]
+ movdqa xmm1, [r0+16] ; xmm1 = pDct[8..15]
+
+ packsswb xmm0, xmm1 ; 16 coefficients narrowed to signed-saturated bytes (non-zero stays non-zero)
+ ; original author's register map (r0 - eax, r1 - ebx, r2 - ecx, r3 - edx) -- TODO confirm against asm_inc.asm
+ xor r3, r3
+ pxor xmm3, xmm3
+ pcmpeqb xmm0, xmm3 ; byte = FF where the coefficient is zero
+ pmovmskb r3d, xmm0 ; bit i set = coefficient i is zero
+
+ xor r3, 0xffff ; invert: bit i set = coefficient i is non-zero
+
+ xor r0, r0 ; r0 accumulates the result (pDct pointer no longer needed)
+ mov r2, 7
+ mov r1, 8
+.loop_low8_find1: ; scan bits 7..1 downwards for a set bit; r2 ends at 0 if none is found
+ bt r3, r2
+ jc .loop_high8_find1
+ dec r2
+ jnz .loop_low8_find1
+.loop_high8_find1: ; scan bits 8..15 upwards for a set bit; r1 ends at 16 if none is found
+ bt r3, r1
+ jc .find1end
+ inc r1
+ cmp r1,16
+ jb .loop_high8_find1
+.find1end:
+ sub r1, r2
+ sub r1, 1 ; r1 = r1 - r2 - 1, always within 0..15
+ lea r2, [i_ds_table]
+ add r0b, [r2+r1] ; result += i_ds_table[r1]
+ mov r1, r3
+ and r3, 0xff ; r3 = mask bits 0..7
+ shr r1, 8
+ and r1, 0xff ; r1 = mask bits 8..15
+ lea r2 , [low_mask_table]
+ add r0b, [r2 +r3] ; result += low_mask_table[low byte]
+ lea r2, [high_mask_table]
+ add r0b, [r2+r1] ; result += high_mask_table[high byte]
+ %ifdef X86_32
+ pop r3 ; NOTE(review): no copy to retrd on this path, so r0 is presumably the return register here - confirm
+ %else
+ mov retrd, r0d ; copy the result into the return register
+ %endif
+ ret
+
+
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse2
+ %assign push_num 0
+ LOAD_1_PARA ; presumably loads level into r0 (macro from asm_inc.asm)
+ movdqa xmm0, [r0] ; xmm0 = level[0..7]
+ movdqa xmm1, [r0+16] ; xmm1 = level[8..15]
+ pxor xmm2, xmm2
+ pcmpeqw xmm0, xmm2 ; word = FFFF where the coefficient is zero
+ pcmpeqw xmm1, xmm2
+ packsswb xmm1, xmm0 ; 16 compare results narrowed to one byte each (FF or 00)
+ xor r1, r1
+ pmovmskb r1d, xmm1 ; bit set = that coefficient is zero
+ xor r1d, 0xffff ; invert: bit set = that coefficient is non-zero
+ mov r2, r1
+ and r1, 0xff ; r1 = low byte of the mask
+ shr r2, 8 ; r2 = high byte of the mask
+; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
+; xor retr, retr
+ ;add al, [nozero_count_table+r2]
+ lea r0 , [nozero_count_table]
+ movzx r2, byte [r0+r2] ; set-bit count of the high byte
+ movzx r1, byte [r0+r1] ; set-bit count of the low byte
+ mov retrq, r2
+ add retrq, r1 ; return the number of non-zero coefficients among the 16
+ ;add al, [nozero_count_table+r1]
+ ret
+
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -37,12 +37,12 @@
ifeq ($(ASM_ARCH), x86)
ENCODER_ASM_SRCS=\
- $(ENCODER_SRCDIR)/core/asm/coeff.asm\
- $(ENCODER_SRCDIR)/core/asm/dct.asm\
- $(ENCODER_SRCDIR)/core/asm/intra_pred.asm\
- $(ENCODER_SRCDIR)/core/asm/memzero.asm\
- $(ENCODER_SRCDIR)/core/asm/quant.asm\
- $(ENCODER_SRCDIR)/core/asm/score.asm\
+ $(ENCODER_SRCDIR)/core/x86/coeff.asm\
+ $(ENCODER_SRCDIR)/core/x86/dct.asm\
+ $(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+ $(ENCODER_SRCDIR)/core/x86/memzero.asm\
+ $(ENCODER_SRCDIR)/core/x86/quant.asm\
+ $(ENCODER_SRCDIR)/core/x86/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
endif
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -514,7 +514,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\src\asm\denoisefilter.asm"
+ RelativePath="..\..\src\x86\denoisefilter.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -554,7 +554,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\src\asm\downsample_bilinear.asm"
+ RelativePath="..\..\src\x86\downsample_bilinear.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -634,7 +634,7 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\src\asm\vaa.asm"
+ RelativePath="..\..\src\x86\vaa.asm"
>
<FileConfiguration
Name="Debug|Win32"
--- a/codec/processing/src/asm/denoisefilter.asm
+++ /dev/null
@@ -1,272 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* predenoise.asm
-;*
-;* Abstract
-;* denoise for SVC2.1
-;* History
-;* 4/13/2010 Created
-;* 7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
-
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
-
- movdqa %1, %3
- psubusb %1, %8
-
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
-%endmacro
-
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-%endmacro
-
-;***********************************************************************
-; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
-
-WELS_EXTERN BilateralLumaFilter8_sse2
-
- push r3
- %assign push_num 1
- LOAD_2_PARA
- PUSH_XMM 8
-
- pxor xmm7, xmm7
-
- mov r3, r0
-
- movq xmm6, [r0]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
-
- dec r0
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
-
- sub r0, r1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
-
- lea r0, [r0 + r1 * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
-
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [r3], xmm5
-
-
- POP_XMM
- pop r3
- %assign push_num 0
-
- ret
-
-;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-
- push r3
-
- %assign push_num 1
-
- LOAD_2_PARA
-
- mov r3, r1
- add r3, r3
- sub r0, r3 ; pixels - 2 * stride
- sub r0, 2
-
- pxor xmm0, xmm0
- pxor xmm3, xmm3
-
- movdqu xmm1, [r0]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- add r0, r3
- movdqu xmm1, [r0]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [r0 + r1]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [r0 + r1 * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [r0 + 2], xmm3
-
-
- pop r3
-
- %assign push_num 0
- ret
--- a/codec/processing/src/asm/downsample_bilinear.asm
+++ /dev/null
@@ -1,1205 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* upsampling.asm
-;*
-;* Abstract
-;* SIMD for pixel domain down sampling
-;*
-;* History
-;* 10/22/2009 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-%ifdef X86_32
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- ; 2nd part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
-
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
-
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
-
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
-
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
- movq [edi ], mm0
- movq [edi+8], mm2
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movq [edi ], mm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
-.xloops:
- ; 1st part horizonal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movd [edi], mm0
-
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $01 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-
-
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
-
-HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
-
-WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
-
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
-
- loop WIDTH
-
-WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg HEIGHT
-
-
-LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
-
-
-
-
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
-
-
-FAST_DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
-
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
-
- loop FAST_WIDTH
-
-FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg FAST_HEIGHT
-
-
-FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
-%endif
--- a/codec/processing/src/asm/vaa.asm
+++ /dev/null
@@ -1,2030 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
-;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
-%endmacro ; END OF SUM_SQR_SSE2
-
-%macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, [%1+%3]
- movdqa xmm4, [%2+%3]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea %1, [%1+%3*2]
- lea %2, [%2+%3*2]
-%endmacro
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
-
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
-
- add %1, %3
- add %2, %3
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
- movdqa xmm1, [%1]
- movdqa xmm2, [%2]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
-
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
-
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
-
- add %1, %3
- add %2, %3
-%endmacro
-
-%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
-%define sad_reg %1
-%define sum_cur_reg %2
-%define sum_ref_reg %3
-%define mad_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- add %5, %7
- add %6, %7
-%endmacro
-
-
-%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
-%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
-%endmacro
-
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
-%define sad_reg %1
-%define sum_reg %2
-%define mad_reg %3
-%define sqdiff_reg %4
- movdqa xmm1, [%5]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
-
- movdqa xmm2, [%6]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
-
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
-
- add %5, %7
- add %6, %7
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-%ifdef X86_32
-
-;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-WELS_EXTERN SampleVariance16x16_sse2
- push esi
- push edi
- push ebx
-
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
-
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
-
- pxor xmm7, xmm7
- movdqu SUM, xmm7
-
-.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
-
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
-
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
-
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
-
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
-
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
-
- add esp, 16
- pop ebx
- pop edi
- pop esi
-
- ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSad_sse2
-%define cur_data esp + pushsize + 4
-%define ref_data esp + pushsize + 8
-%define iPicWidth esp + pushsize + 12
-%define iPicHeight esp + pushsize + 16
-%define iPicStride esp + pushsize + 20
-%define psadframe esp + pushsize + 24
-%define psad8x8 esp + pushsize + 28
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
-width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- WELS_SAD_16x2_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz width_loop
-
- pop edi
- pop esi
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-%else ;64-bit
-
-;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-WELS_EXTERN SampleVariance16x16_sse2
- %define SUM r10;[esp]
- %define SUM_CUR r11;[esp+4]
- %define SQR r13;[esp+8]
- %define SQR_CUR r15;[esp+12]
-
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- LOAD_5_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1,r1d
- SIGN_EXTENSION r3,r3d
-
- mov r12,010h
- pxor xmm7, xmm7
- movq SUM, xmm7
- movq SUM_CUR,xmm7
- movq SQR,xmm7
- movq SQR_CUR,xmm7
-
-.hloops:
- mov r14,0
- movdqa xmm0, [r0] ; y_ref
- movdqa xmm1, [r2] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd r14d, xmm4
- add SUM, r14
-
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd r14d, xmm1
- add SQR, r14
-
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd r14d, xmm0
- and r14, 0ffffh
- add SUM_CUR, r14
-
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd r14d, xmm0
- add SQR_CUR, r14
-
- lea r0, [r0+r1]
- lea r2, [r2+r3]
- dec r12
- jnz near .hloops
-
- mov r0, SUM
- sar r0, 8
- imul r0, r0
- mov r1, SQR
- sar r1, 8
- sub r1, r0
- mov [r4], r1w ; to store uiMotionIndex
- mov r0, SUM_CUR
- sar r0, 8
- imul r0, r0
- mov r1, SQR_CUR
- sar r1, 8
- sub r1, r0
- mov [r4+2], r1w ; to store uiTextureIndex
-
- POP_XMM
- LOAD_5_PARA_POP
- pop r15
- pop r14
- pop r13
- pop r12
-
-
- %assign push_num 0
-
- ret
-
-
-;*************************************************************************************************************
-;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSad_sse2
-%define cur_data r0
-%define ref_data r1
-%define iPicWidth r2
-%define iPicHeight r3
-%define iPicStride r4
-%define psadframe r5
-%define psad8x8 r6
-
- push r12
- push r13
- %assign push_num 2
- LOAD_7_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
-
- mov r12,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
-
- shl r12, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-height_loop:
- mov r13, r2
- push r0
- push r1
-width_loop:
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6], xmm6
- psrldq xmm6, 8
- movd [r6+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- WELS_SAD_16x2_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r6+8], xmm6
- psrldq xmm6, 8
- movd [r6+12], xmm6
-
- add r6, 16
- sub r0, r12
- sub r1, r12
- add r0, 16
- add r1, 16
-
- dec r13
- jnz width_loop
-
- pop r1
- pop r0
- add r0, r12
- add r1, r12
-
- dec r3
- jnz height_loop
-
- ;mov r13, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [psadframe], xmm7
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- POP_XMM
- LOAD_7_PARA_POP
- pop r13
- pop r12
- %assign push_num 0
- ret
-
-%endif
-
-
-%ifdef X86_32
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-%define localsize 8
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
-
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
-
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz var_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz var_height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-%else ;64-bit
-
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-%define cur_data arg1 ;r0
-%define ref_data arg2 ;r1
-%define iPicWidth arg3 ;r2
-%define iPicHeight arg4 ;r3
-%define iPicStride arg5
-%define psadframe arg6
-%define psad8x8 arg7
-%define psum16x16 arg8
-%define psqsum16x16 arg9
-
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 8
-
-%ifdef WIN64
- mov r4, arg5 ;iPicStride
- mov r5, arg6 ;psad8x8
-%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
-
- mov r13,r4
- shr r2,4
- shr r3,4
-
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
-var_height_loop:
- push r2
- %assign push_num push_num+1
- mov r11, r0
- mov r12, r1
-var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14], xmm6
- psrldq xmm6, 8
- movd [r14+4], xmm6
-
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
- paddd xmm7, xmm6
- movd [r14+8], xmm6
- psrldq xmm6, 8
- movd [r14+12], xmm6
-
- mov r15, psum16x16
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [r15], xmm5
- add dword psum16x16, 4
-
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
-
- mov r15, psqsum16x16
- movd [r15], xmm4
- add dword psqsum16x16, 4
-
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
-
- dec r2
- jnz var_width_loop
-
- pop r2
- %assign push_num push_num-1
- mov r0, r11
- mov r1, r12
- add r0, r13
- add r1, r13
- dec r3
- jnz var_height_loop
-
- mov r15, psadframe
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [r15], xmm7
-
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
-%assign push_num 0
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-%endif
-
-%ifdef X86_32
-
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
-
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
-
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
-
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- dec ecx
- jnz sqdiff_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
-
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
-
-%else
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-%define localsize 12
-%define cur_data arg1;r0
-%define ref_data arg2;r1
-%define iPicWidth arg3;r2
-%define iPicHeight arg4;r3
-%define iPicStride arg5;
-%define psadframe arg6;
-%define psad8x8 arg7;
-%define psum16x16 arg8;
-%define psqsum16x16 arg9;
-%define psqdiff16x16 arg10
-
- push r12
- push r13
- push r14
- push r15
- %assign push_num 4
- PUSH_XMM 10
-
-%ifdef WIN64
- mov r4,arg5
-%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
-
- mov r13,r4
- shr r2,4 ; iPicWidth/16
- shr r3,4 ; iPicHeight/16
- shl r13,4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8 ;framesad
- pxor xmm9, xmm9
-sqdiff_height_loop:
- ;mov ecx, dword [iPicWidth]
- ;mov r14,r2
- push r2
- %assign push_num push_num +1
- mov r10, r0
- mov r11, r1
-sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+4], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
-
-
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
- movdqa xmm1, xmm7
- movd [r14+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [r14+12], xmm7
- movd r15d, xmm1
- movd xmm9, r15d
- paddd xmm8,xmm9
-
- mov r15, psum16x16
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [r15], xmm6
- add dword psum16x16, 4
-
- mov r15, psqsum16x16
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [r15], xmm5
- add dword psqsum16x16, 4
-
- mov r15, psqdiff16x16
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [r15], xmm4
- add dword psqdiff16x16, 4
-
- add r14,16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
-
- dec r2
- jnz sqdiff_width_loop
-
- pop r2
- %assign push_num push_num -1
-
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
-
- dec r3
- jnz sqdiff_height_loop
-
- mov r13, psadframe
- movd [r13], xmm8
-
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
- %assign push_num 0
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
-
-
-
-%endif
-
-%ifdef X86_32
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define p_sd8x8 esp + pushsize + localsize + 32
-%define p_mad8x8 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_ecx esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
-bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
-
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
-
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
-
-
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
-
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
-
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz bgd_height_loop
-
- mov edx, [psadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-%define localsize 16
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define p_sd8x8 esp + pushsize + localsize + 44
-%define p_mad8x8 esp + pushsize + localsize + 48
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define tmp_ecx esp + 12
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
-
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
-
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-%else
-
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-%define cur_data arg1;
-%define ref_data arg2;
-%define iPicWidth arg3;
-%define iPicHeight arg4;
-%define iPicStride arg5;
-%define psadframe arg6;
-%define psad8x8 arg7;
-%define p_sd8x8 arg8;
-%define p_mad8x8 arg9;
-
- push r12
- push r13
- push r14
- push r15
-%assign push_num 4
- PUSH_XMM 10
-%ifdef WIN64
- mov r4,arg5
- ; mov r5,arg6
-%endif
- mov r14,arg7
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
-
-
- mov r13,r4
- mov r15,r0
- shr r2,4
- shr r3,4
- shl r13,4
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
-bgd_height_loop:
- ;mov ecx, dword [iPicWidth]
- push r2
- %assign push_num push_num+1
- mov r10, r15
- mov r11, r1
-bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
-
-
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm4
-
- ;mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd r0d, xmm4
-
-
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- ;mov p_mad8x8, r14
-
-
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
-
-
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
-
- ;mov r14, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- movhlps xmm1, xmm4
- movd r0d, xmm4
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- add r14, 2
- mov p_mad8x8, r14
-
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
- mov r14, psad8x8
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [r14], xmm1
- add r14, 16
- mov psad8x8, r14 ; sad8x8
-
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; sad frame
-
- mov r14, p_sd8x8
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [r14], xmm1
- add r14, 16
- mov p_sd8x8, r14
-
-
- ;add edx, 16
- sub r15, r13
- sub r1, r13
- add r15, 16
- add r1, 16
-
-
- dec r2
- jnz bgd_width_loop
- pop r2
-%assign push_num push_num-1
- mov r15, r10
- mov r1, r11
- add r15, r13
- add r1, r13
-
- dec r3
- jnz bgd_height_loop
-
- mov r13, psadframe
- movd [r13], xmm8
-
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
-%assign push_num 0
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-%define cur_data arg1;
-%define ref_data arg2;
-%define iPicWidth arg3;
-%define iPicHeight arg4;
-%define iPicStride arg5;
-%define psadframe arg6;
-%define psad8x8 arg7;
-%define psum16x16 arg8;
-%define psqsum16x16 arg9;
-%define psqdiff16x16 arg10;
-%define p_sd8x8 arg11
-%define p_mad8x8 arg12
-
- push r12
- push r13
- push r14
- push r15
-%assign push_num 4
- PUSH_XMM 10
-%ifdef WIN64
- mov r4,arg5
- ;mov r5,arg6
-%endif
- SIGN_EXTENSION r2,r2d
- SIGN_EXTENSION r3,r3d
- SIGN_EXTENSION r4,r4d
-
- mov r13,r4
- shr r2, 4 ; iPicWidth/16
- shr r3, 4 ; iPicHeight/16
- shl r13, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm8, xmm8
- pxor xmm9, xmm9
-
-
-sqdiff_bgd_height_loop:
- mov r10, r0
- mov r11, r1
- push r2
-%assign push_num push_num+1
-sqdiff_bgd_width_loop:
-
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
-
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
-
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9,r14d
- paddd xmm8, xmm9 ; iFrameSad
-
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [r14], xmm1 ; sum
-
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
-
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
-
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
-
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
-
- mov r14, psad8x8
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [r14], xmm2
- movd [r14+4], xmm1
- add r14, 8
- mov psad8x8, r14 ; sad8x8
-
- paddd xmm1, xmm2
- movd r14d, xmm1
- movd xmm9, r14d
- paddd xmm8, xmm9 ; iFrameSad
-
- mov r14, psum16x16
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd r15d, xmm1 ; sum
- add [r14], r15d
- add r14, 4
- mov psum16x16, r14
-
- mov r14, psqsum16x16
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [r14], xmm2 ; sqsum
- add r14, 4
- mov psqsum16x16, r14
-
- mov r14, p_sd8x8
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [r14], xmm1
- add r14, 8
- mov p_sd8x8, r14
-
- mov r14, p_mad8x8
- WELS_MAX_REG_SSE2 xmm5
-
-
- movhlps xmm1, xmm5
- push r0
- movd r0d, xmm5
- mov [r14], r0b
- movd r0d, xmm1
- mov [r14+1],r0b
- pop r0
- add r14, 2
- mov p_mad8x8, r14
-
- mov r14, psqdiff16x16
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [r14], xmm4
- add r14, 4
- mov psqdiff16x16, r14
-
- add r14, 16
- sub r0, r13
- sub r1, r13
- add r0, 16
- add r1, 16
-
- dec r2
- jnz sqdiff_bgd_width_loop
- pop r2
- %assign push_num push_num-1
- mov r0, r10
- mov r1, r11
- add r0, r13
- add r1, r13
-
- dec r3
- jnz sqdiff_bgd_height_loop
-
- mov r14, psadframe
- movd [r14], xmm8
-
- POP_XMM
- pop r15
- pop r14
- pop r13
- pop r12
-%assign push_num 0
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-%endif
--- /dev/null
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -1,0 +1,272 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* denoisefilter.asm
+;*
+;* Abstract
+;* denoise for SVC2.1
+;* History
+;* 4/13/2010 Created
+;* 7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32 ; eight words of 32: weight base loaded into xmm3 by BilateralLumaFilter8_sse2
+sse2_20 times 8 dw 20 ; eight words of 20: centre-tap multiplier used by WEIGHT_LINE3_UV
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro WEIGHT_LINE 9 ; %1,%2,%8: scratch  %3: weight base  %4: weight total  %5: weighted sum  %6: centre pixels (words)  %7: zero  %9: 8 neighbour bytes (memory)
+ movq %2, %9 ; load 8 neighbour pixels
+ punpcklbw %2, %7 ; interleave with %7 (zeroed by the caller) -> 8 words
+ movdqa %8, %2 ; working copy for the difference
+
+ movdqa %1, %6
+ psubusb %1, %8 ; saturating center - cur
+ psubusb %8, %6 ; saturating cur - center
+ por %8, %1 ; ABS(curPixel - centerPixel);
+
+ movdqa %1, %3
+ psubusb %1, %8 ; weight base - ABS(diff), clamped at 0
+
+ pmullw %1, %1 ; square it ...
+ psrlw %1, 5 ; ... and divide by 32: weight of this neighbour
+ pmullw %2, %1 ; curPixel * weight
+ paddusw %4, %1 ; weight total += weight (saturating)
+ paddusw %5, %2 ; weighted sum += curPixel * weight (saturating)
+%endmacro
+
+%macro WEIGHT_LINE1_UV 4 ; taps 1 1 2 1 1; %1: 16 source bytes  %2: scratch  %3: word accumulator  %4: zero
+ movdqa %2, %1
+ punpcklbw %2, %4 ; bytes at offset 0..7 -> words, weight 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1 ; bytes at offset 1..8, weight 1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2 ; bytes at offset 2..9
+ punpcklbw %2, %4
+ psllw %2, 1 ; weight 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3 ; bytes at offset 3..10, weight 1
+ punpcklbw %2, %4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4 ; bytes at offset 4..11, weight 1
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE2_UV 4 ; taps 1 2 4 2 1; %1: 16 source bytes  %2: scratch  %3: word accumulator  %4: zero
+ movdqa %2, %1
+ punpcklbw %2, %4 ; bytes at offset 0..7 -> words, weight 1
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1 ; bytes at offset 1..8
+ punpcklbw %2, %4
+ psllw %2, 1 ; weight 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2 ; bytes at offset 2..9
+ punpcklbw %2, %4
+ psllw %2, 2 ; weight 4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3 ; bytes at offset 3..10
+ punpcklbw %2, %4
+ psllw %2, 1 ; weight 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4 ; bytes at offset 4..11, weight 1
+ punpcklbw %2, %4
+ paddw %3, %2
+%endmacro
+
+%macro WEIGHT_LINE3_UV 4 ; taps 2 4 20 4 2; %1: 16 source bytes  %2: scratch  %3: word accumulator  %4: zero
+ movdqa %2, %1
+ punpcklbw %2, %4 ; bytes at offset 0..7 -> words
+ psllw %2, 1 ; weight 2
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 1 ; bytes at offset 1..8
+ punpcklbw %2, %4
+ psllw %2, 2 ; weight 4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 2 ; bytes at offset 2..9
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20] ; weight 20 (not a power of two, so multiply)
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 3 ; bytes at offset 3..10
+ punpcklbw %2, %4
+ psllw %2, 2 ; weight 4
+ paddw %3, %2
+
+ movdqa %2, %1
+ psrldq %2, 4 ; bytes at offset 4..11
+ punpcklbw %2, %4
+ psllw %2, 1 ; weight 2
+ paddw %3, %2
+%endmacro
+
+;***********************************************************************
+; void BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point; 1-8: neighbours, each weighted by its similarity to 0; 8 pixels are filtered in place
+
+WELS_EXTERN BilateralLumaFilter8_sse2
+
+ push r3 ; r3 is used as a temporary below and restored before ret
+ %assign push_num 1
+ LOAD_2_PARA ; NOTE(review): r1 (int stride) is used full-width below - confirm it is sign-extended on 64-bit builds
+ PUSH_XMM 8
+
+ pxor xmm7, xmm7 ; zero, used to widen bytes to words
+
+ mov r3, r0 ; remember the output address; r0 is moved around below
+
+ movq xmm6, [r0] ; 8 centre pixels
+ punpcklbw xmm6, xmm7 ; widen to words
+ movdqa xmm3, [sse2_32] ; weight base (32 per word) for WEIGHT_LINE
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
+
+ dec r0 ; one pixel left of the centre
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
+
+ sub r0, r1 ; row above
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
+
+ lea r0, [r0 + r1 * 2] ; row below
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
+
+ pcmpeqw xmm0, xmm0 ; all ones
+ psrlw xmm0, 15 ; 1 in every word
+ psllw xmm0, 8 ; 256 in every word
+ psubusw xmm0, xmm4 ; 256 - nTotWeight: weight left over for the centre pixel
+ pmullw xmm0, xmm6 ; centre pixel * leftover weight
+ paddusw xmm5, xmm0 ; nSum now carries a total weight of 256
+ psrlw xmm5, 8 ; divide by 256
+ packuswb xmm5, xmm5 ; words back to bytes
+ movq [r3], xmm5 ; overwrite the 8 centre pixels
+
+
+ POP_XMM
+ pop r3
+ %assign push_num 0
+
+ ret
+
+;***********************************************************************
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter (weights sum to 64, hence the shift by 6 at the end); 8 pixels are filtered in place:
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+
+ push r3 ; r3 holds 2 * stride below and is restored before ret
+
+ %assign push_num 1
+
+ LOAD_2_PARA ; NOTE(review): r1 (int stride) is used full-width below - confirm it is sign-extended on 64-bit builds
+
+ mov r3, r1
+ add r3, r3 ; r3 = 2 * stride
+ sub r0, r3 ; pixels - 2 * stride
+ sub r0, 2 ; and 2 columns left: top-left corner of the 5x5 window
+
+ pxor xmm0, xmm0 ; zero, used to widen bytes to words
+ pxor xmm3, xmm3 ; weighted sum accumulator
+
+ movdqu xmm1, [r0] ; row -2
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1] ; row -1
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ add r0, r3 ; r0 = pixels - 2 (centre row)
+ movdqu xmm1, [r0] ; row 0
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1] ; row +1
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1 * 2] ; row +2
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ psrlw xmm3, 6 ; divide by 64 (truncating)
+ packuswb xmm3, xmm3 ; words back to bytes
+ movq [r0 + 2], xmm3 ; r0 + 2 == pixels: store the 8 filtered bytes
+
+
+ pop r3
+
+ %assign push_num 0
+ ret
--- /dev/null
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -1,0 +1,1205 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* downsample_bilinear.asm
+;*
+;* Abstract
+;* SIMD for pixel domain down sampling
+;*
+;* History
+;* 10/22/2009 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Shuffle masks, presumably for pshufb: pick the even (low) / odd (high) bytes into 16-bit lanes
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse ; 2:1 downsample: each output byte is pavgb(pavgb(2 pixels, line 1), pavgb(2 pixels, line 2))
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address = 24 bytes, so the first argument is at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth kept in ebx (negated below)
+ sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; de-interleave mm0, mm1, mm2, mm3 into even (upper-case) and odd (lower-case) pixels
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; gather the even and odd halves of mm4, mm5, mm6, mm7
+ movq mm0, mm4 ; copy, so both dword halves can be unpacked
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, held in mm0 until the 2nd horizontal part is done, then both are stored together
+
+ ; 2nd part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm1: d D c C b B a A mm2: h H g G f F e E
+ ;2nd Line Src: mm3: l L k K j J i I mm4: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+
+ ; de-interleave mm1, mm2, mm3, mm4 (mm0 holds the first-half result and is not touched)
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+
+ ; gather the even and odd halves of mm5, mm6, mm7, mm1
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
+
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+ movq [edi ], mm0 ; output pixels 0..7
+ movq [edi+8], mm2 ; output pixels 8..15
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; do-while: the body runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance past the two source lines just read
+ lea esi, [esi+2*ebx] ; rewind the bytes walked by .xloops [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; next destination line
+ lea edi, [edi+ebx] ; rewind the bytes walked by .xloops [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS ; presumably emms to leave MMX state - confirm in asm_inc.asm
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse ; 2:1 downsample, 16 source bytes (8 output bytes) per inner iteration
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address = 24 bytes, so the first argument is at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth kept in ebx (negated below)
+ sar eax, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; single horizontal pass per iteration: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; de-interleave mm0, mm1, mm2, mm3 into even (upper-case) and odd (lower-case) pixels
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; gather the even and odd halves of mm4, mm5, mm6, mm7
+ movq mm0, mm4 ; copy, so both dword halves can be unpacked
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1: the 8 finished output pixels
+
+ movq [edi ], mm0 ; store them right away (no second half in this variant)
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; do-while: the body runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; advance past the two source lines just read
+ lea esi, [esi+2*ebx] ; rewind the bytes walked by .xloops [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; next destination line
+ lea edi, [edi+ebx] ; rewind the bytes walked by .xloops [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS ; presumably emms to leave MMX state - confirm in asm_inc.asm
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address => first stack argument is at [esp+24]
+ ; 2:1 downsample in both directions, 8 source bytes x 2 lines -> 4 dst bytes per inner iteration
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; ebp = iSrcHeight >> 1 = number of dst rows (outer loop count)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $02 ; eax = iDstWidth / 4 = inner loop count (4 dst bytes per iteration)
+ neg ebx ; ebx = -iDstWidth, used to rewind esi/edi at the end of a row
+ ; each loop = source bandwidth: 8 bytes
+.xloops:
+ ; horizontal loop: 8 source bytes from each of two lines per iteration
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
+
+ ; de-interleave even (upper-case) and odd (lower-case) bytes of mm0 and mm1
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm2
+
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm4
+
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
+
+ ; avg within the 8-byte source width (8 x 2 lines)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1: low dword = temp_row1, high dword = temp_row2
+ pshufw mm1, mm0, 04eh ; 01001110 B: swap dwords so temp_row2 lines up with temp_row1
+ pavgb mm0, mm1 ; low dword = (temp_row1+temp_row2+1)>>1 = the 4 dst bytes of this iteration
+
+ movd [edi], mm0
+
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
+
+ dec eax
+ jg near .xloops ; dec/jg: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; esi += 2 * iSrcStride (two src lines were consumed)
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; edi += iDstStride
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS ; macro defined outside this chunk; presumably clears MMX state (emms) - confirm
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address => first stack argument is at [esp+24]
+ ; 2:1 downsample in both directions, 32 source bytes x 2 lines -> 16 dst bytes per inner iteration
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; ebp = iSrcHeight >> 1 = number of dst rows (outer loop count)
+
+ movdqa xmm7, [shufb_mask_low] ; mask low (table defined outside this chunk; per the comments below it selects the even bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (likewise for the odd bytes)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $04 ; eax = iDstWidth / 16 = inner loop count (16 dst bytes per iteration)
+ neg ebx ; ebx = -iDstWidth, used to rewind esi/edi at the end of a row
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; horizontal loop: 32 source bytes from each of two lines per iteration
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line (movdqa: address must be 16-byte aligned)
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4 ; per word: low byte = (even+odd+1)>>1, high byte = 0 (1st line, bytes 0..15)
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5 ; same for 1st line, bytes 16..31
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4 ; same for 2nd line, bytes 0..15
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5 ; same for 2nd line, bytes 16..31
+
+ packuswb xmm0, xmm1 ; 16 horizontal averages of the 1st line as bytes
+ packuswb xmm2, xmm3 ; 16 horizontal averages of the 2nd line as bytes
+ pavgb xmm0, xmm2 ; vertical average = the 16 dst bytes of this iteration
+
+ ; write pDst
+ movdqa [edi], xmm0 ; movdqa: pDst must be 16-byte aligned here
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; dec/jg: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; esi += 2 * iSrcStride (two src lines were consumed)
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; edi += iDstStride
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address => first stack argument is at [esp+24]
+ ; 2:1 downsample in both directions, 16 source bytes x 2 lines -> 8 dst bytes per inner iteration
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; ebp = iSrcHeight >> 1 = number of dst rows (outer loop count)
+ movdqa xmm7, [shufb_mask_low] ; mask low (table defined outside this chunk; per the comments below it selects the even bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (likewise for the odd bytes)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $03 ; eax = iDstWidth / 8 = inner loop count (8 dst bytes per iteration)
+ neg ebx ; ebx = -iDstWidth, used to rewind esi/edi at the end of a row
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: 16 source bytes from each of two lines per iteration
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line (movdqa: address must be 16-byte aligned)
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm2 high bits
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2 ; per word: low byte = (even+odd+1)>>1, high byte = 0 (1st line)
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3 ; same for the 2nd line
+
+ pavgb xmm0, xmm1 ; vertical average, still one result per word
+ packuswb xmm0, xmm1 ; low 8 bytes = the 8 dst bytes; the high 8 bytes (packed from xmm1) are never stored
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; dec/jg: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; esi += 2 * iSrcStride (two src lines were consumed)
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; edi += iDstStride
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address => first stack argument is at [esp+24]
+ ; 2:1 downsample in both directions, 32 source bytes x 2 lines -> 16 dst bytes per inner iteration
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; ebp = iSrcHeight >> 1 = number of dst rows (outer loop count)
+
+ movdqa xmm7, [shufb_mask_low] ; mask low (table defined outside this chunk; per the comments below it selects the even bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (likewise for the odd bytes)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $04 ; eax = iDstWidth / 16 = inner loop count (16 dst bytes per iteration)
+ neg ebx ; ebx = -iDstWidth, used to rewind esi/edi at the end of a row
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; horizontal loop: 32 source bytes from each of two lines per iteration
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line (SSE4.1 non-temporal load; address must be 16-byte aligned)
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4 ; per word: low byte = (even+odd+1)>>1, high byte = 0 (1st line, bytes 0..15)
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5 ; same for 1st line, bytes 16..31
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4 ; same for 2nd line, bytes 0..15
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5 ; same for 2nd line, bytes 16..31
+
+ packuswb xmm0, xmm1 ; 16 horizontal averages of the 1st line as bytes
+ packuswb xmm2, xmm3 ; 16 horizontal averages of the 2nd line as bytes
+ pavgb xmm0, xmm2 ; vertical average = the 16 dst bytes of this iteration
+
+ ; write pDst
+ movdqa [edi], xmm0 ; movdqa: pDst must be 16-byte aligned here
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops ; dec/jg: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; esi += 2 * iSrcStride (two src lines were consumed)
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; edi += iDstStride
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address => first stack argument is at [esp+24]
+ ; 2:1 downsample in both directions, 16 source bytes x 2 lines -> 8 dst bytes per inner iteration
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $01 ; ebp = iSrcHeight >> 1 = number of dst rows (outer loop count)
+ movdqa xmm7, [shufb_mask_low] ; mask low (table defined outside this chunk; per the comments below it selects the even bytes into zero-extended words)
+ movdqa xmm6, [shufb_mask_high] ; mask high (likewise for the odd bytes)
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $01 ; iSrcWidth >> 1 = iDstWidth
+ mov ebx, eax ; ebx = iDstWidth (negated below)
+ sar eax, $03 ; eax = iDstWidth / 8 = inner loop count (8 dst bytes per iteration)
+ neg ebx ; ebx = -iDstWidth, used to rewind esi/edi at the end of a row
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: 16 source bytes from each of two lines per iteration
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line (SSE4.1 non-temporal load; address must be 16-byte aligned)
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2 ; per word: low byte = (even+odd+1)>>1, high byte = 0 (1st line)
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3 ; same for the 2nd line
+
+ pavgb xmm0, xmm1 ; vertical average, still one result per word
+ packuswb xmm0, xmm1 ; low 8 bytes = the 8 dst bytes; the high 8 bytes (packed from xmm1) are never stored
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops ; dec/jg: the body always runs at least once per row
+
+ ; next line
+ lea esi, [esi+2*ecx] ; esi += 2 * iSrcStride (two src lines were consumed)
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx] ; edi += iDstStride
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+
+
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+; Q15 fixed-point bilinear downsampler. The last column and the last row are nearest-sample copies. iDstWidth/iDstHeight must be >= 1.
+;**************************************************************************************************************
+; cdecl, all arguments on the stack. Clobbers eax, ecx, edx, xmm0-xmm7 and flags; eax holds uiScaleX at return (no dedicated return value).
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize ; esp must stay fixed from here on: every name above is esp-relative
+
+ pxor xmm0, xmm0
+ ; (a dead "mov edx, 32767" stood here; edx was overwritten before any use)
+ mov eax, [uiScaleX]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm1, eax ; uinc(uiScaleX mod 32768)
+ movd xmm2, ebx ; -uinc (mod 32768)
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32768)
+ movd xmm2, ebx ; -vinc (mod 32768)
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 40003fffh ; words 16384 (0.5 in Q15) and 16383
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax ; rows to interpolate = dwDstHeight - 1 (the last row is copied in LAST_ROW)
+ mov eax, 16384
+ mov [yInverse], eax ; y source position starts at 0.5 (Q15)
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+ cmp dword [tmpHeight], 0
+ jle near LAST_ROW ; dwDstHeight <= 1: no interpolated rows; previously the HEIGHT body still ran once
+HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer part of the Q15 y position = source row index
+ mul dword [dwSrcStride] ; edx:eax = row * stride (clobbers edx)
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = address of the next source row
+
+ mov eax, 16384
+ mov [xInverse], eax ; x source position starts at 0.5 (Q15)
+ mov ecx, [dwDstWidth]
+ dec ecx ; columns to interpolate = dwDstWidth - 1 (the last one is copied in WIDTH_END)
+ jle near WIDTH_END ; dwDstWidth <= 1: skip the loop; with ecx == 0 the old "loop" wrapped to 2^32 iterations
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+
+WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15 ; integer part of the Q15 x position = source column index
+
+ movd xmm1, [esi+eax] ; xxxxxxba (4 bytes are read, only a and b are used)
+ movd xmm2, [ebp+eax] ; xxxxxxdc (4 bytes are read, only c and d are used)
+ pxor xmm0, xmm0 ; re-zero each iteration: xmm0 is reused as scratch below
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 -> four Q30 weights as dwords
+ movdqa xmm0, xmm2
+ pmuludq xmm2, xmm1 ; 64-bit products for dwords 0 and 2 (weights * a, c)
+ psrlq xmm0, 32
+ psrlq xmm1, 32
+ pmuludq xmm0, xmm1 ; 64-bit products for dwords 1 and 3 (weights * b, d)
+ paddq xmm2, xmm0
+ pshufd xmm1, xmm2, 00001110b
+ paddq xmm2, xmm1 ; low qword = sum of the four weighted pixels (Q30)
+ psrlq xmm2, 29
+
+ movd eax, xmm2
+ inc eax
+ shr eax, 1 ; together with the >> 29 above: (sum + 2^29) >> 30, i.e. rounded
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ paddw xmm3, xmm7 ; inc u
+ psllw xmm3, 1
+ psrlw xmm3, 1 ; shift pair clears bit 15: u = (u + uinc) mod 32768
+
+ dec ecx ; dec/jnz instead of the slow legacy "loop" instruction
+ jnz WIDTH
+WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax] ; last column: plain copy of the nearest pixel of the upper source row
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1 ; v = (v + vinc) mod 32768
+
+ dec dword [tmpHeight]
+ jg HEIGHT
+
+
+LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth] ; must be >= 1: the loop below is do-while shaped
+
+LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15
+
+ mov al, [esi+eax] ; last row: nearest-sample copy, no interpolation
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ dec ecx ; dec/jnz instead of the slow legacy "loop" instruction
+ jnz LAST_ROW_WIDTH
+LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+
+
+
+
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+; Faster, lower-precision variant: x position in Q16, y position in Q15, 16-bit weights. iDstWidth/iDstHeight must be >= 1.
+;**************************************************************************************************************
+; cdecl, all arguments on the stack. Clobbers eax, ecx, edx, xmm0-xmm7 and flags; eax holds uiScaleX at return (no dedicated return value).
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+ push ebp
+ push esi
+ push edi
+ push ebx
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize ; esp must stay fixed from here on: every name above is esp-relative
+
+ pxor xmm0, xmm0 ; xmm0 stays zero for the whole function (unpack/pack helper)
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc (mod 65536)
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32768)
+ movd xmm2, ebx ; -vinc (mod 32768)
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384 ; rounding constant (0.5 in Q15), kept in ebx for the whole function
+
+
+FAST_DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax ; rows to interpolate = dwDstHeight - 1 (the last row is copied in FAST_LAST_ROW)
+ mov eax, 16384
+ mov [yInverse], eax ; y source position starts at 0.5 (Q15)
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+ cmp dword [tmpHeight], 0
+ jle near FAST_LAST_ROW ; dwDstHeight <= 1: no interpolated rows; previously the FAST_HEIGHT body still ran once
+FAST_HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15 ; integer part of the Q15 y position = source row index
+ mul dword [dwSrcStride] ; edx:eax = row * stride (clobbers edx)
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = address of the next source row
+
+ mov eax, 32768
+ mov [xInverse], eax ; x source position starts at 0.5 (Q16)
+ mov ecx, [dwDstWidth]
+ dec ecx ; columns to interpolate = dwDstWidth - 1 (the last one is copied in FAST_WIDTH_END)
+ jle near FAST_WIDTH_END ; dwDstWidth <= 1: skip the loop; with ecx == 0 the old "loop" wrapped to 2^32 iterations
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16 ; integer part of the Q16 x position = source column index
+
+ movd xmm1, [esi+eax] ; xxxxxxba (4 bytes are read, only a and b are used)
+ movd xmm2, [ebp+eax] ; xxxxxxdc (4 bytes are read, only c and d are used)
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 -> (v*u) >> 16, four Q15 weights as words
+ pmaddwd xmm2, xmm1 ; dword0 = w0*a + w1*b, dword1 = w2*c + w3*d
+ pshufd xmm1, xmm2, 00000001b
+ paddd xmm2, xmm1 ; dword0 = sum of the four weighted pixels
+ movd xmm1, ebx
+ paddd xmm2, xmm1 ; + 16384 for rounding
+ psrld xmm2, 15
+
+ packuswb xmm2, xmm0 ; saturate to an unsigned byte in byte 0
+ movd eax, xmm2
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ paddw xmm3, xmm7 ; inc u (word add wraps mod 65536, no masking needed)
+
+ dec ecx ; dec/jnz instead of the slow legacy "loop" instruction
+ jnz FAST_WIDTH
+FAST_WIDTH_END:
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax] ; last column: plain copy of the nearest pixel of the upper source row
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1
+ psrlw xmm4, 1 ; shift pair clears bit 15: v = (v + vinc) mod 32768
+
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth] ; must be >= 1: the loop below is do-while shaped
+
+FAST_LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16
+
+ mov al, [esi+eax] ; last row: nearest-sample copy, no interpolation
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ dec ecx ; dec/jnz instead of the slow legacy "loop" instruction
+ jnz FAST_LAST_ROW_WIDTH
+FAST_LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+%endif
--- /dev/null
+++ b/codec/processing/src/x86/vaa.asm
@@ -1,0 +1,2030 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero: %1 = sum of squares of the 16 bytes in %2 (in every dword); %2 is clobbered; %3 must hold zero
+ movdqa %1, %2
+ punpcklbw %1, %3 ; low 8 bytes -> 8 words
+ punpckhbw %2, %3 ; high 8 bytes -> 8 words
+ pmaddwd %1, %1 ; square the words and add adjacent pairs -> 4 dwords
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddd %1, %2 ; fold the upper qword onto the lower one
+ pshufd %2, %1, 0B1h ; 10110001 B
+ paddd %1, %2 ; fold adjacent dwords: every dword now holds the total
+%endmacro ; END OF SUM_SQR_SSE2
+
+%macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3 ; xmm6 += SAD of two 16-byte rows of [%1] vs [%2]; %3 = row stride; clobbers xmm1-xmm4
+ movdqa xmm1, [%1] ; movdqa: both pointers must be 16-byte aligned
+ movdqa xmm2, [%2]
+ movdqa xmm3, [%1+%3]
+ movdqa xmm4, [%2+%3]
+ psadbw xmm1, xmm2 ; two partial SADs: low 8 bytes in bits 0..15, high 8 bytes in bits 64..79
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea %1, [%1+%3*2] ; advance both pointers by two rows
+ lea %2, [%2+%3*2]
+%endmacro
+
+; measured to outperform the phaddw (SSSE3) based sequence
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp: low word of %1 = sum of its 8 words (mod 2^16); %2 is scratch
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2 ; fold the upper 4 words onto the lower 4
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2 ; fold words 2,3 onto words 0,1
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2 ; fold word 1 onto word 0
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
+
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3 ; one 16-byte row; uses xmm0 as the zero register (not set here - caller must keep it 0)
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3 ; xmm6 += SAD([%1], [%2])
+
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3 ; xmm5 += byte sum of [%1] (SAD against xmm0)
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2 ; xmm4 += squares of the [%1] bytes (four dword partial sums)
+
+ add %1, %3 ; advance both pointers by one row
+ add %2, %3
+%endmacro
+
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3 ; one 16-byte row; uses xmm0 as the zero register (not set here - caller must keep it 0)
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff = max - min = per-byte absolute difference
+
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum of the [%1] bytes
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum of the [%1] bytes (four dword partial sums)
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff: squares of the absolute differences (four dword partial sums)
+
+ add %1, %3 ; advance both pointers by one row
+ add %2, %3
+%endmacro
+
+%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 ; %1-%4 = accumulator registers named below; clobbers xmm1-xmm3; xmm0 must be 0 (not set here)
+%define sad_reg %1
+%define sum_cur_reg %2
+%define sum_ref_reg %3
+%define mad_reg %4
+ movdqa xmm1, [%5]
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ psadbw xmm3, xmm0 ; summing the absolute differences gives the SAD
+ paddd sad_reg, xmm3 ; sad
+
+ add %5, %7 ; advance both pointers by one row
+ add %6, %7
+%endmacro
+
+
+%macro WELS_MAX_REG_SSE2 1 ; byte 0 of %1 = max of its bytes 0..7, byte 8 = max of bytes 8..15; only xmm1 is clobbered
+%define max_reg %1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1 ; byte i = max(b[i], b[i+4])
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1 ; byte i now covers b[i], b[i+2], b[i+4], b[i+6]
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1 ; byte i now covers b[i] .. b[i+7]
+%endmacro
+
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 ; one 16-byte row; clobbers xmm1-xmm3; xmm0 must be 0 (not set here)
+%define sad_reg %1
+%define sum_reg %2
+%define mad_reg %3
+%define sqdiff_reg %4
+ movdqa xmm1, [%5]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3 ; four dword partial sums of squares of the [%5] bytes
+ movdqa xmm3, xmm2
+ psllq xmm2, 32 ; even dwords moved up into the odd dword slots
+ psrlq xmm3, 32
+ psllq xmm3, 32 ; odd dwords kept in place, even dword slots cleared
+ paddd xmm2, xmm3 ; dwords 1 and 3 = partial sqsums, dwords 0 and 2 = 0
+ paddd sad_reg, xmm2 ; sqsum accumulates in dwords 1 and 3 of sad_reg
+
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur (dwords 0 and 2 of sum_reg)
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4
+ paddd sum_reg, xmm3 ; sum_ref (shifted into dwords 1 and 3 of sum_reg)
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad (dwords 0 and 2 of sad_reg)
+
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
+
+ add %5, %7 ; advance both pointers by one row
+ add %6, %7
+%endmacro
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+%ifdef X86_32
+
+;***********************************************************************
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+WELS_EXTERN SampleVariance16x16_sse2
+ push esi
+ push edi
+ push ebx
+ ; 16x16 block: stores (sum(d*d)>>8) - (sum(d)>>8)^2 with d = |ref-src| at [pMotionTexture], and the same statistic of the src pixels at [pMotionTexture+2]
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
+
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
+
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7 ; one 16-byte store zeroes all four dword accumulators
+
+.hloops:
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5 ; low dword = SAD of the whole 16-byte row
+ movd ebx, xmm4
+ add SUM, ebx
+
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
+
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh ; only the low word holds the row sum
+ add SUM_CUR, ebx
+
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
+
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
+
+ mov ebx, 0
+ mov bx, word SUM ; low 16 bits suffice: 256 pixels * 255 = 65280 at most
+ sar ebx, 8 ; / 256 pixels = mean absolute difference
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx ; mean of squares - square of mean
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
+
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
+
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
+
+ ret
+
+
+
+;*************************************************************************************************************
+;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+; Writes four 8x8 SADs per 16x16 block to psad8x8 (order: top-left, top-right, bottom-left, bottom-right) and their total to *psadframe.
+
+WELS_EXTERN VAACalcSad_sse2
+%define cur_data esp + pushsize + 4
+%define ref_data esp + pushsize + 8
+%define iPicWidth esp + pushsize + 12
+%define iPicHeight esp + pushsize + 16
+%define iPicStride esp + pushsize + 20
+%define psadframe esp + pushsize + 24
+%define psad8x8 esp + pushsize + 28
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16 (the stack argument slot itself is reused as the counter)
+ shr dword [iPicHeight], 4 ; iPicHeight/16 (likewise)
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+height_loop:
+ mov ecx, dword [iPicWidth] ; read before the two pushes below, while the esp-relative names are still valid
+ push esi
+ push edi
+width_loop:
+ pxor xmm6, xmm6 ;
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6 ; SAD of the top-left 8x8 (bytes 0..7 of rows 0..7)
+ psrldq xmm6, 8
+ movd [edx+4], xmm6 ; SAD of the top-right 8x8
+
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6 ; SAD of the bottom-left 8x8
+ psrldq xmm6, 8
+ movd [edx+12], xmm6 ; SAD of the bottom-right 8x8
+
+ add edx, 16
+ sub esi, eax ; back up the 16 rows just consumed ...
+ sub edi, eax
+ add esi, 16 ; ... and step 16 bytes right to the next block
+ add edi, 16
+
+ dec ecx
+ jnz width_loop
+
+ pop edi
+ pop esi
+ add esi, eax ; next row of blocks: 16 lines down from the saved row start
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5 ; add the two qword halves of the frame accumulator
+ movd [edx], xmm7
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+%else ;64-bit
+
+;***********************************************************************
+; void SampleVariance16x16_sse2 (uint8_t *y_ref, int32_t y_ref_stride, uint8_t *y_src, int32_t y_src_stride,
+;                                SMotionTextureUnit *pMotionTexture);
+;
+; x86-64 variant. Sixteen rows of sixteen bytes are read from y_ref and y_src with movdqa, so
+; every row address has to be 16-byte aligned.
+;
+; Four running totals live in general-purpose registers:
+;   SUM      += sum of |y_ref[x] - y_src[x]|   (psadbw)
+;   SQR      += diff * diff                    (SUM_SQR_SSE2, defined elsewhere)
+;   SUM_CUR  += y_src[x]                       (SUM_WORD_8x2_SSE2, defined elsewhere)
+;   SQR_CUR  += y_src[x] * y_src[x]            (SUM_SQR_SSE2)
+;
+; Two consecutive 16-bit results are stored at pMotionTexture:
+;   [+0] (uiMotionIndex)  = (SQR     >> 8) - (SUM     >> 8)^2
+;   [+2] (uiTextureIndex) = (SQR_CUR >> 8) - (SUM_CUR >> 8)^2
+;
+; Scratch: r12 = row counter, r14 = per-row temporary. r12-r15 are saved and restored.
+;***********************************************************************
+WELS_EXTERN SampleVariance16x16_sse2
+    %define SUM         r10
+    %define SUM_CUR     r11
+    %define SQR         r13
+    %define SQR_CUR     r15
+
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1,r1d
+    SIGN_EXTENSION r3,r3d
+
+    mov r12,010h                        ; 16 rows to process
+    pxor xmm7, xmm7                     ; xmm7 stays zero: unpack partner and "zero" macro argument
+    movq SUM, xmm7                      ; clear the four accumulators
+    movq SUM_CUR,xmm7
+    movq SQR,xmm7
+    movq SQR_CUR,xmm7
+
+.hloops:
+    ; r14 needs no explicit clearing: each "movd r14d, xmm" below zero-extends into r14
+    movdqa xmm0, [r0]                   ; y_ref
+    movdqa xmm1, [r2]                   ; y_src
+    movdqa xmm2, xmm0                   ; store first for future process
+    movdqa xmm3, xmm1
+    ; sum += diff;
+    movdqa xmm4, xmm0
+    psadbw xmm4, xmm1                   ; 2 parts, [0,..,15], [64,..,79]
+    ; to be continued for sum
+    pshufd xmm5, xmm4, 0C6h             ; 11000110 B
+    paddw xmm4, xmm5
+    movd r14d, xmm4
+    add SUM, r14
+
+    ; sqr += diff * diff;
+    pmaxub xmm0, xmm1
+    pminub xmm1, xmm2
+    psubb xmm0, xmm1                    ; diff = max - min = |y_ref - y_src| per byte
+    SUM_SQR_SSE2 xmm1, xmm0, xmm7       ; dst, pSrc, zero
+    movd r14d, xmm1
+    add SQR, r14
+
+    ; sum_cur += y_src[x];
+    movdqa xmm0, xmm3                   ; cur_orig
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    paddw xmm0, xmm1                    ; 8x2
+    SUM_WORD_8x2_SSE2 xmm0, xmm1
+    movd r14d, xmm0
+    and r14, 0ffffh                     ; only the low word of the reduction is used
+    add SUM_CUR, r14
+
+    ; sqr_cur += y_src[x] * y_src[x];
+    SUM_SQR_SSE2 xmm0, xmm3, xmm7       ; dst, pSrc, zero
+    movd r14d, xmm0
+    add SQR_CUR, r14
+
+    lea r0, [r0+r1]                     ; next y_ref row
+    lea r2, [r2+r3]                     ; next y_src row
+    dec r12
+    jnz near .hloops
+
+    mov r0, SUM
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR
+    sar r1, 8
+    sub r1, r0
+    mov [r4], r1w                       ; to store uiMotionIndex
+    mov r0, SUM_CUR
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR_CUR
+    sar r1, 8
+    sub r1, r0
+    mov [r4+2], r1w                     ; to store uiTextureIndex
+
+    ; release the accumulator aliases so they do not leak into later code
+    %undef SUM
+    %undef SUM_CUR
+    %undef SQR
+    %undef SQR_CUR
+
+    POP_XMM
+    LOAD_5_PARA_POP
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+
+
+    %assign push_num 0
+
+    ret
+
+
+;*************************************************************************************************************
+; void VAACalcSad_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                       int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;
+; x86-64 variant. LOAD_7_PARA (defined elsewhere) brings the seven arguments into r0..r6.
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as an upper and a lower half
+; (four WELS_SAD_16x2_SSE2 invocations per half); after each half, dwords 0 and 2 of xmm6 are
+; written to two consecutive int32 slots of psad8x8. xmm7 collects every half-tile result; its
+; two quadwords are folded together at the end and the low dword goes to *psadframe.
+;
+; WELS_SAD_16x2_SSE2 is expected to accumulate into xmm6 and to advance both row pointers by
+; two rows per invocation - TODO confirm against the macro definition.
+;
+; Both loops test their counter only after the body, so iPicWidth and iPicHeight must each be
+; at least 16.
+;
+; Registers: r0 = cur row pointer   r1 = ref row pointer   r4 = iPicStride   r12 = iPicStride * 16
+;            r2 = tiles per row     r3 = tile rows left    r13 = tiles left in the current row
+;            r6 = psad8x8 cursor    r5 = psadframe
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSad_sse2
+%define     cur_data        r0
+%define     ref_data        r1
+%define     iPicWidth       r2
+%define     iPicHeight      r3
+%define     iPicStride      r4
+%define     psadframe       r5
+%define     psad8x8         r6
+
+    push    r12
+    push    r13
+    %assign push_num 2
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+
+    mov     r12,    r4
+    shl     r12,    4                       ; r12 = iPicStride * 16
+    shr     r2,     4                       ; tiles per row
+    shr     r3,     4                       ; tile rows
+    pxor    xmm7,   xmm7                    ; frame-wide accumulator
+    pxor    xmm0,   xmm0                    ; zeroed once; not referenced by the instructions written here
+
+height_loop:
+    mov     r13,    r2
+    push    r0                              ; remember where this tile row starts
+    push    r1
+
+width_loop:
+    ; ---- upper half of the tile ----
+    pxor    xmm6,   xmm6
+%rep 4
+    WELS_SAD_16x2_SSE2  r0, r1, r4
+%endrep
+    paddd   xmm7,   xmm6
+    movd    [r6],       xmm6                ; dword 0 of xmm6
+    psrldq  xmm6,   8
+    movd    [r6+4],     xmm6                ; former dword 2 of xmm6
+
+    ; ---- lower half of the tile ----
+    pxor    xmm6,   xmm6
+%rep 4
+    WELS_SAD_16x2_SSE2  r0, r1, r4
+%endrep
+    paddd   xmm7,   xmm6
+    movd    [r6+8],     xmm6
+    psrldq  xmm6,   8
+    movd    [r6+12],    xmm6
+
+    ; next tile: four more psad8x8 slots, back up 16 rows, right 16 columns
+    add     r6,     16
+    sub     r0,     r12
+    add     r0,     16
+    sub     r1,     r12
+    add     r1,     16
+
+    dec     r13
+    jnz     width_loop
+
+    ; next tile row
+    pop     r1
+    pop     r0
+    add     r0,     r12
+    add     r1,     r12
+
+    dec     r3
+    jnz     height_loop
+
+    ; fold the two quadwords of the frame accumulator; psadframe is already in a register
+    movdqa  xmm5,   xmm7
+    psrldq  xmm7,   8
+    paddd   xmm7,   xmm5
+    movd    [psadframe], xmm7
+
+%undef      cur_data
+%undef      ref_data
+%undef      iPicWidth
+%undef      iPicHeight
+%undef      iPicStride
+%undef      psadframe
+%undef      psad8x8
+%undef      pushsize
+    POP_XMM
+    LOAD_7_PARA_POP
+    pop     r13
+    pop     r12
+    %assign push_num 0
+    ret
+
+%endif
+
+
+%ifdef X86_32
+;*************************************************************************************************************
+; void VAACalcSadVar_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+;                          int32_t *psqsum16x16)
+;
+; x86-32 variant: arguments are read from the stack; two dword locals hold the row-start pointers.
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SUM_SQSUM_16x1_SSE2 invocations (macro defined elsewhere). Per tile:
+;   psad8x8     <- four int32: dwords 0 and 2 of xmm6 after each run
+;   psum16x16   <- one int32:  dword 0 + dword 2 of xmm5
+;   psqsum16x16 <- one int32:  sum of all four dwords of xmm4
+; xmm7 accumulates xmm6 over the whole picture and is folded into *psadframe at the end.
+;
+; The psum16x16 / psqsum16x16 stack slots are advanced in place and the iPicWidth / iPicHeight
+; slots are overwritten with tile counts. Both loops test their counter only after the body, so
+; iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: esi = cur row pointer    edi = ref row pointer   ebx = iPicStride
+;            eax = iPicStride * 16    edx = psad8x8 cursor    ecx = tiles left in the current row
+;            ebp = scratch output pointer
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+; pushsize = bytes of saved registers, localsize = bytes of locals below them
+%define     pushsize        16
+%define     localsize       8
+%define     cur_data        esp + pushsize + localsize + 4
+%define     ref_data        esp + pushsize + localsize + 8
+%define     iPicWidth       esp + pushsize + localsize + 12
+%define     iPicHeight      esp + pushsize + localsize + 16
+%define     iPicStride      esp + pushsize + localsize + 20
+%define     psadframe       esp + pushsize + localsize + 24
+%define     psad8x8         esp + pushsize + localsize + 28
+%define     psum16x16       esp + pushsize + localsize + 32
+%define     psqsum16x16     esp + pushsize + localsize + 36
+%define     tmp_esi         esp + 0
+%define     tmp_edi         esp + 4
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub     esp,    localsize
+
+    mov     ebx,    [iPicStride]
+    mov     eax,    ebx
+    shl     eax,    4                       ; eax = iPicStride * 16
+    mov     esi,    [cur_data]
+    mov     edi,    [ref_data]
+    mov     edx,    [psad8x8]
+    shr     dword [iPicHeight], 4           ; tile rows
+    shr     dword [iPicWidth],  4           ; tile columns
+    pxor    xmm7,   xmm7                    ; frame-wide accumulator
+    pxor    xmm0,   xmm0                    ; zeroed once; not referenced by the instructions written here
+
+var_height_loop:
+    mov     ecx,    dword [iPicWidth]
+    mov     [tmp_esi],  esi                 ; remember where this tile row starts
+    mov     [tmp_edi],  edi
+
+var_width_loop:
+    pxor    xmm6,   xmm6                    ; hiQuad_loQuad pSad8x8
+    pxor    xmm5,   xmm5                    ; pSum16x16
+    pxor    xmm4,   xmm4                    ; sqsum_16x16
+
+    ; ---- first eight rows ----
+%rep 8
+    WELS_SAD_SUM_SQSUM_16x1_SSE2    esi, edi, ebx
+%endrep
+    paddd   xmm7,   xmm6
+    movd    [edx],      xmm6
+    psrldq  xmm6,   8
+    movd    [edx+4],    xmm6
+
+    ; ---- last eight rows (xmm5 / xmm4 keep accumulating) ----
+    pxor    xmm6,   xmm6
+%rep 8
+    WELS_SAD_SUM_SQSUM_16x1_SSE2    esi, edi, ebx
+%endrep
+    paddd   xmm7,   xmm6
+    movd    [edx+8],    xmm6
+    psrldq  xmm6,   8
+    movd    [edx+12],   xmm6
+
+    ; *psum16x16++ = dword0 + dword2 of xmm5
+    mov     ebp,    [psum16x16]
+    movdqa  xmm1,   xmm5
+    psrldq  xmm1,   8
+    paddd   xmm5,   xmm1
+    movd    [ebp],  xmm5
+    add     dword [psum16x16], 4
+
+    ; *psqsum16x16++ = horizontal sum of the four dwords of xmm4
+    movdqa  xmm5,   xmm4
+    psrldq  xmm5,   8
+    paddd   xmm4,   xmm5
+    movdqa  xmm3,   xmm4
+    psrldq  xmm3,   4
+    paddd   xmm4,   xmm3
+
+    mov     ebp,    [psqsum16x16]
+    movd    [ebp],  xmm4
+    add     dword [psqsum16x16], 4
+
+    ; next tile: four more psad8x8 slots, back up 16 rows, right 16 columns
+    add     edx,    16
+    sub     esi,    eax
+    add     esi,    16
+    sub     edi,    eax
+    add     edi,    16
+
+    dec     ecx
+    jnz     var_width_loop
+
+    ; next tile row
+    mov     esi,    [tmp_esi]
+    mov     edi,    [tmp_edi]
+    add     esi,    eax
+    add     edi,    eax
+
+    dec     dword [iPicHeight]
+    jnz     var_height_loop
+
+    ; fold the two quadwords of the frame accumulator and store the low dword
+    mov     edx,    [psadframe]
+    movdqa  xmm5,   xmm7
+    psrldq  xmm7,   8
+    paddd   xmm7,   xmm5
+    movd    [edx],  xmm7
+
+    add     esp,    localsize
+    pop     ebx
+    pop     edi
+    pop     esi
+    pop     ebp
+%undef      cur_data
+%undef      ref_data
+%undef      iPicWidth
+%undef      iPicHeight
+%undef      iPicStride
+%undef      psadframe
+%undef      psad8x8
+%undef      psum16x16
+%undef      psqsum16x16
+%undef      tmp_esi
+%undef      tmp_edi
+%undef      pushsize
+%undef      localsize
+    ret
+
+%else ;64-bit
+
+;*************************************************************************************************************
+; void VAACalcSadVar_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+;                          int32_t *psqsum16x16)
+;
+; x86-64 variant. cur_data/ref_data/iPicWidth/iPicHeight are used through r0..r3, iPicStride
+; through r4; the remaining arguments are reached through the argN macros (defined elsewhere),
+; whose stack forms depend on push_num - hence the %assign bookkeeping around "push r2".
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SUM_SQSUM_16x1_SSE2 invocations (macro defined elsewhere). Per tile:
+;   psad8x8     <- four int32: dwords 0 and 2 of xmm6 after each run
+;   psum16x16   <- one int32:  dword 0 + dword 2 of xmm5
+;   psqsum16x16 <- one int32:  sum of all four dwords of xmm4
+; xmm7 accumulates xmm6 over the whole picture and is folded into *psadframe at the end.
+;
+; psum16x16 and psqsum16x16 are advanced in place in their argument slots. Both loops test
+; their counter only after the body, so iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: r0 = cur row pointer   r1 = ref row pointer   r4 = iPicStride   r13 = iPicStride * 16
+;            r2 = tiles left in row (reloaded from the stack per tile row)   r3 = tile rows left
+;            r11/r12 = row-start copies of r0/r1   r14 = psad8x8 cursor   r15 = scratch pointer
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+%define cur_data arg1 ;r0
+%define ref_data arg2 ;r1
+%define iPicWidth arg3 ;r2
+%define iPicHeight arg4 ;r3
+%define iPicStride arg5
+%define psadframe arg6
+%define psad8x8 arg7
+%define psum16x16 arg8
+%define psqsum16x16 arg9
+
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 8
+
+%ifdef WIN64
+    mov r4, arg5                        ; iPicStride is only loaded explicitly on WIN64
+%endif
+    mov r14,arg7                        ; psad8x8 cursor
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
+
+    mov r13,r4
+    shr r2,4                            ; tiles per row
+    shr r3,4                            ; tile rows
+
+    shl r13,4                           ; iPicStride*16
+    pxor xmm0, xmm0
+    pxor xmm7, xmm7                     ; iFrameSad
+var_height_loop:
+    push r2                             ; keep the per-row tile count; r2 is the inner counter
+    %assign push_num push_num+1
+    mov r11, r0                         ; remember where this tile row starts
+    mov r12, r1
+var_width_loop:
+    pxor xmm6, xmm6                     ; hiQuad_loQuad pSad8x8
+    pxor xmm5, xmm5                     ; pSum16x16
+    pxor xmm4, xmm4                     ; sqsum_16x16
+%rep 8
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+%endrep
+    paddd xmm7, xmm6
+    movd [r14], xmm6
+    psrldq xmm6, 8
+    movd [r14+4], xmm6
+
+    pxor xmm6, xmm6
+%rep 8
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+%endrep
+    paddd xmm7, xmm6
+    movd [r14+8], xmm6
+    psrldq xmm6, 8
+    movd [r14+12], xmm6
+
+    ; *psum16x16++ = dword0 + dword2 of xmm5
+    mov r15, psum16x16
+    movdqa xmm1, xmm5
+    psrldq xmm1, 8
+    paddd xmm5, xmm1
+    movd [r15], xmm5
+    add qword psum16x16, 4              ; qword: the slot holds a 64-bit pointer
+
+    ; *psqsum16x16++ = horizontal sum of the four dwords of xmm4
+    movdqa xmm5, xmm4
+    psrldq xmm5, 8
+    paddd xmm4, xmm5
+    movdqa xmm3, xmm4
+    psrldq xmm3, 4
+    paddd xmm4, xmm3
+
+    mov r15, psqsum16x16
+    movd [r15], xmm4
+    add qword psqsum16x16, 4            ; qword: the slot holds a 64-bit pointer
+
+    ; next tile: four more psad8x8 slots, back up 16 rows, right 16 columns
+    add r14,16
+    sub r0, r13
+    sub r1, r13
+    add r0, 16
+    add r1, 16
+
+    dec r2
+    jnz var_width_loop
+
+    pop r2
+    %assign push_num push_num-1
+    mov r0, r11
+    mov r1, r12
+    add r0, r13
+    add r1, r13
+    dec r3
+    jnz var_height_loop
+
+    ; fold the two quadwords of the frame accumulator and store the low dword
+    mov r15, psadframe
+    movdqa xmm5, xmm7
+    psrldq xmm7, 8
+    paddd xmm7, xmm5
+    movd [r15], xmm7
+
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+%assign push_num 0
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+    ret
+
+%endif
+
+%ifdef X86_32
+
+;*************************************************************************************************************
+; void VAACalcSadSsd_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+;                          int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;
+; x86-32 variant: arguments are read from the stack; three dword locals hold the row-start
+; pointers and the running frame total.
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 invocations (macro defined elsewhere). Per tile:
+;   psad8x8      <- four int32: dwords 0 and 2 of xmm7 after each run
+;   psum16x16    <- one int32:  dword 0 + dword 2 of xmm6
+;   psqsum16x16  <- one int32:  sum of all four dwords of xmm5
+;   psqdiff16x16 <- one int32:  sum of all four dwords of xmm4
+; After each run (dword 0 + dword 2 of xmm7) is added to tmp_sadframe, which is copied to
+; *psadframe at the end.
+;
+; The psum16x16 / psqsum16x16 / psqdiff16x16 stack slots are advanced in place and the
+; iPicWidth / iPicHeight slots are overwritten with tile counts. Both loops test their counter
+; only after the body, so iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: esi = cur row pointer    edi = ref row pointer   ebx = iPicStride
+;            eax = iPicStride * 16    edx = psad8x8 cursor    ecx = tiles left in the current row
+;            ebp = scratch (output pointer / per-run total)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define pushsize 16
+    push ebp
+    push esi
+    push edi
+    push ebx
+    sub esp, localsize
+
+    mov esi, [cur_data]
+    mov edi, [ref_data]
+    mov ebx, [iPicStride]
+    mov edx, [psad8x8]
+    mov eax, ebx
+
+    shr dword [iPicWidth], 4            ; iPicWidth/16
+    shr dword [iPicHeight], 4           ; iPicHeight/16
+    shl eax, 4                          ; iPicStride*16
+    pxor xmm0, xmm0
+    movd [tmp_sadframe], xmm0           ; running frame total starts at zero
+sqdiff_height_loop:
+    mov ecx, dword [iPicWidth]
+    mov [tmp_esi], esi                  ; remember where this tile row starts
+    mov [tmp_edi], edi
+sqdiff_width_loop:
+    pxor xmm7, xmm7                     ; hiQuad_loQuad pSad8x8
+    pxor xmm6, xmm6                     ; pSum16x16
+    pxor xmm5, xmm5                     ; sqsum_16x16 four dword
+    pxor xmm4, xmm4                     ; sqdiff_16x16 four Dword
+%rep 8
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+%endrep
+    movdqa xmm1, xmm7
+    movd [edx], xmm7
+    psrldq xmm7, 8
+    paddd xmm1, xmm7
+    movd [edx+4], xmm7
+    movd ebp, xmm1                      ; dword0 + dword2 of this run
+    add [tmp_sadframe], ebp
+
+    pxor xmm7, xmm7
+%rep 8
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+%endrep
+    movdqa xmm1, xmm7
+    movd [edx+8], xmm7
+    psrldq xmm7, 8
+    paddd xmm1, xmm7
+    movd [edx+12], xmm7
+    movd ebp, xmm1
+    add [tmp_sadframe], ebp
+
+    ; *psum16x16++ = dword0 + dword2 of xmm6
+    mov ebp, [psum16x16]
+    movdqa xmm1, xmm6
+    psrldq xmm1, 8
+    paddd xmm6, xmm1
+    movd [ebp], xmm6
+    add dword [psum16x16], 4
+
+    ; *psqsum16x16++ = horizontal sum of the four dwords of xmm5
+    mov ebp, [psqsum16x16]
+    pshufd xmm6, xmm5, 14               ;00001110
+    paddd xmm6, xmm5
+    pshufd xmm5, xmm6, 1                ;00000001
+    paddd xmm5, xmm6
+    movd [ebp], xmm5
+    add dword [psqsum16x16], 4
+
+    ; *psqdiff16x16++ = horizontal sum of the four dwords of xmm4
+    mov ebp, [psqdiff16x16]
+    pshufd xmm5, xmm4, 14               ; 00001110
+    paddd xmm5, xmm4
+    pshufd xmm4, xmm5, 1                ; 00000001
+    paddd xmm4, xmm5
+    movd [ebp], xmm4
+    add dword [psqdiff16x16], 4
+
+    ; next tile: four more psad8x8 slots, back up 16 rows, right 16 columns
+    add edx, 16
+    sub esi, eax
+    sub edi, eax
+    add esi, 16
+    add edi, 16
+
+    dec ecx
+    jnz sqdiff_width_loop
+
+    mov esi, [tmp_esi]
+    mov edi, [tmp_edi]
+    add esi, eax
+    add edi, eax
+
+    dec dword [iPicHeight]
+    jnz sqdiff_height_loop
+
+    mov ebx, [tmp_sadframe]
+    mov eax, [psadframe]
+    mov [eax], ebx
+
+    add esp, localsize
+    pop ebx
+    pop edi
+    pop esi
+    pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+    ret
+
+%else
+
+
+;*************************************************************************************************************
+; void VAACalcSadSsd_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+;                          int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;
+; x86-64 variant. cur_data/ref_data/iPicWidth/iPicHeight are used through r0..r3, iPicStride
+; through r4; the remaining arguments are reached through the argN macros (defined elsewhere),
+; whose stack forms depend on push_num - hence the %assign bookkeeping around "push r2".
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 invocations (macro defined elsewhere). Per tile:
+;   psad8x8      <- four int32: dwords 0 and 2 of xmm7 after each run
+;   psum16x16    <- one int32:  dword 0 + dword 2 of xmm6
+;   psqsum16x16  <- one int32:  sum of all four dwords of xmm5
+;   psqdiff16x16 <- one int32:  sum of all four dwords of xmm4
+; After each run (dword 0 + dword 2 of xmm7) is added to xmm8, whose low dword is stored to
+; *psadframe at the end.
+;
+; psum16x16, psqsum16x16 and psqdiff16x16 are advanced in place in their argument slots. Both
+; loops test their counter only after the body, so iPicWidth and iPicHeight must each be at
+; least 16.
+;
+; Registers: r0 = cur row pointer   r1 = ref row pointer   r4 = iPicStride   r13 = iPicStride * 16
+;            r2 = tiles left in row (reloaded from the stack per tile row)   r3 = tile rows left
+;            r10/r11 = row-start copies of r0/r1   r14 = psad8x8 cursor   r15 = scratch
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+%define localsize 12
+%define cur_data arg1;r0
+%define ref_data arg2;r1
+%define iPicWidth arg3;r2
+%define iPicHeight arg4;r3
+%define iPicStride arg5;
+%define psadframe arg6;
+%define psad8x8 arg7;
+%define psum16x16 arg8;
+%define psqsum16x16 arg9;
+%define psqdiff16x16 arg10
+
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 10
+
+%ifdef WIN64
+    mov r4,arg5                         ; iPicStride is only loaded explicitly on WIN64
+%endif
+    mov r14,arg7                        ; psad8x8 cursor
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
+
+    mov r13,r4
+    shr r2,4                            ; iPicWidth/16
+    shr r3,4                            ; iPicHeight/16
+    shl r13,4                           ; iPicStride*16
+    pxor xmm0, xmm0
+    pxor xmm8, xmm8                     ;framesad
+    pxor xmm9, xmm9
+sqdiff_height_loop:
+    push r2                             ; keep the per-row tile count; r2 is the inner counter
+    %assign push_num push_num +1
+    mov r10, r0                         ; remember where this tile row starts
+    mov r11, r1
+sqdiff_width_loop:
+    pxor xmm7, xmm7                     ; hiQuad_loQuad pSad8x8
+    pxor xmm6, xmm6                     ; pSum16x16
+    pxor xmm5, xmm5                     ; sqsum_16x16 four dword
+    pxor xmm4, xmm4                     ; sqdiff_16x16 four Dword
+%rep 8
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+%endrep
+    movdqa xmm1, xmm7
+    movd [r14], xmm7
+    psrldq xmm7, 8
+    paddd xmm1, xmm7
+    movd [r14+4], xmm7
+    movd r15d, xmm1                     ; isolate dword0 + dword2 of this run ...
+    movd xmm9, r15d
+    paddd xmm8,xmm9                     ; ... and add it to the frame total
+
+
+    pxor xmm7, xmm7
+%rep 8
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+%endrep
+    movdqa xmm1, xmm7
+    movd [r14+8], xmm7
+    psrldq xmm7, 8
+    paddd xmm1, xmm7
+    movd [r14+12], xmm7
+    movd r15d, xmm1
+    movd xmm9, r15d
+    paddd xmm8,xmm9
+
+    ; *psum16x16++ = dword0 + dword2 of xmm6
+    mov r15, psum16x16
+    movdqa xmm1, xmm6
+    psrldq xmm1, 8
+    paddd xmm6, xmm1
+    movd [r15], xmm6
+    add qword psum16x16, 4              ; qword: the slot holds a 64-bit pointer
+
+    ; *psqsum16x16++ = horizontal sum of the four dwords of xmm5
+    mov r15, psqsum16x16
+    pshufd xmm6, xmm5, 14               ;00001110
+    paddd xmm6, xmm5
+    pshufd xmm5, xmm6, 1                ;00000001
+    paddd xmm5, xmm6
+    movd [r15], xmm5
+    add qword psqsum16x16, 4            ; qword: the slot holds a 64-bit pointer
+
+    ; *psqdiff16x16++ = horizontal sum of the four dwords of xmm4
+    mov r15, psqdiff16x16
+    pshufd xmm5, xmm4, 14               ; 00001110
+    paddd xmm5, xmm4
+    pshufd xmm4, xmm5, 1                ; 00000001
+    paddd xmm4, xmm5
+    movd [r15], xmm4
+    add qword psqdiff16x16, 4           ; qword: the slot holds a 64-bit pointer
+
+    ; next tile: four more psad8x8 slots, back up 16 rows, right 16 columns
+    add r14,16
+    sub r0, r13
+    sub r1, r13
+    add r0, 16
+    add r1, 16
+
+    dec r2
+    jnz sqdiff_width_loop
+
+    pop r2
+    %assign push_num push_num -1
+
+    mov r0, r10
+    mov r1, r11
+    add r0, r13
+    add r1, r13
+
+    dec r3
+    jnz sqdiff_height_loop
+
+    mov r13, psadframe
+    movd [r13], xmm8
+
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    %assign push_num 0
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+    ret
+
+
+
+%endif
+
+%ifdef X86_32
+;*************************************************************************************************************
+; void VAACalcSadBgd_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
+;                          uint8_t *p_mad8x8)
+;
+; x86-32 variant: arguments are read from the stack; three dword locals hold the row-start
+; pointers and a spilled copy of the column counter.
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SD_MAD_16x1_SSE2 invocations (macro defined elsewhere), with xmm7/xmm6/xmm5 shifted
+; left by one dword between the runs so both runs end up in the same registers. Per tile:
+;   p_mad8x8 <- four bytes (two after each run, taken from xmm4 after WELS_MAX_REG_SSE2)
+;   psad8x8  <- four int32, xmm7 reordered by pshufd 10001101b
+;   p_sd8x8  <- four int32, (xmm6 - xmm5) reordered the same way
+; ebp accumulates the sum of the four xmm7 dwords of every tile and is stored to *psadframe.
+;
+; psad8x8 and p_sd8x8 are written with movdqa, so both must stay 16-byte aligned. The
+; psad8x8 / p_sd8x8 / p_mad8x8 stack slots are advanced in place and the iPicWidth /
+; iPicHeight slots are overwritten with tile counts. Both loops test their counter only after
+; the body, so iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: esi = cur row pointer    edi = ref row pointer   ebx = iPicStride
+;            eax = iPicStride * 16    ebp = frame total       ecx = tiles left / byte scratch
+;            edx = scratch output pointer
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define p_sd8x8 esp + pushsize + localsize + 32
+%define p_mad8x8 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_ecx esp + 8
+%define pushsize 16
+    push ebp
+    push esi
+    push edi
+    push ebx
+    sub esp, localsize
+    mov esi, [cur_data]
+    mov edi, [ref_data]
+    mov ebx, [iPicStride]
+    mov eax, ebx
+
+    shr dword [iPicWidth], 4            ; iPicWidth/16
+    shr dword [iPicHeight], 4           ; iPicHeight/16
+    shl eax, 4                          ; iPicStride*16
+    xor ebp, ebp                        ; frame total
+    pxor xmm0, xmm0
+bgd_height_loop:
+    mov ecx, dword [iPicWidth]
+    mov [tmp_esi], esi                  ; remember where this tile row starts
+    mov [tmp_edi], edi
+bgd_width_loop:
+    pxor xmm7, xmm7                     ; pSad8x8
+    pxor xmm6, xmm6                     ; sum_cur_8x8
+    pxor xmm5, xmm5                     ; sum_ref_8x8
+    pxor xmm4, xmm4                     ; pMad8x8
+%rep 8
+    WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi, edi, ebx
+%endrep
+
+    ; two p_mad8x8 bytes for the first run: byte 0 of each quadword of xmm4
+    mov edx, [p_mad8x8]
+    WELS_MAX_REG_SSE2 xmm4
+    mov [tmp_ecx], ecx                  ; ecx is borrowed as byte scratch until the loop tail
+    movhlps xmm1, xmm4
+    movd ecx, xmm4
+    mov [edx], cl
+    movd ecx, xmm1
+    mov [edx+1],cl
+    add edx, 2
+    mov [p_mad8x8], edx
+
+    ; shift the first run's results up one dword to make room for the second run
+    pslldq xmm7, 4
+    pslldq xmm6, 4
+    pslldq xmm5, 4
+
+    pxor xmm4, xmm4                     ; pMad8x8
+%rep 8
+    WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi, edi, ebx
+%endrep
+
+    ; two p_mad8x8 bytes for the second run
+    mov edx, [p_mad8x8]
+    WELS_MAX_REG_SSE2 xmm4
+    movhlps xmm1, xmm4
+    movd ecx, xmm4
+    mov [edx], cl
+    movd ecx, xmm1
+    mov [edx+1],cl
+    add edx, 2
+    mov [p_mad8x8], edx
+
+    ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+    mov edx, [psad8x8]
+    pshufd xmm1, xmm7, 10001101b        ; D3 D2 D1 D0
+    movdqa [edx], xmm1
+    add edx, 16
+    mov [psad8x8], edx                  ; sad8x8
+
+    paddd xmm1, xmm7                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd xmm2, xmm1, 00000011b
+    paddd xmm1, xmm2
+    movd edx, xmm1
+    add ebp, edx                        ; sad frame
+
+    mov edx, [p_sd8x8]
+    psubd xmm6, xmm5                    ; sum_cur - sum_ref
+    pshufd xmm1, xmm6, 10001101b
+    movdqa [edx], xmm1
+    add edx, 16
+    mov [p_sd8x8], edx
+
+    ; next tile: back up 16 rows, right 16 columns
+    sub esi, eax
+    sub edi, eax
+    add esi, 16
+    add edi, 16
+
+    mov ecx, [tmp_ecx]                  ; restore the column counter
+    dec ecx
+    jnz bgd_width_loop
+
+    mov esi, [tmp_esi]
+    mov edi, [tmp_edi]
+    add esi, eax
+    add edi, eax
+
+    dec dword [iPicHeight]
+    jnz bgd_height_loop
+
+    mov edx, [psadframe]
+    mov [edx], ebp
+
+    add esp, localsize
+    pop ebx
+    pop edi
+    pop esi
+    pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_ecx
+%undef pushsize
+%undef localsize
+    ret
+
+
+
+;*************************************************************************************************************
+; void VAACalcSadSsdBgd_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                             int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+;                             int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; x86-32 variant: arguments are read from the stack; four dword locals hold the row-start
+; pointers, the running frame total and a spilled copy of the column counter.
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_BGD_SQDIFF_16x1_SSE2 invocations (macro defined elsewhere). Register layout during a
+; tile, as described by the original author:
+;   xmm7 = sqsum1 sad1 sqsum0 sad0      xmm6 = Sref1 Scur1 Sref0 Scur0
+;   xmm5 = pMad8x8                      xmm4 = sqdiff_16x16 (four dwords)
+; Per tile the function stores:
+;   psad8x8      <- four int32 (two per run: dwords 0 and 2 of xmm7)
+;   p_sd8x8      <- four int32 (two per run: Scur - Sref for each half of xmm6)
+;   p_mad8x8     <- four bytes (two per run, from xmm5 after WELS_MAX_REG_SSE2)
+;   psum16x16    <- one int32  (Scur0 + Scur1, summed over both runs)
+;   psqsum16x16  <- one int32  (sqsum0 + sqsum1; xmm7's odd dwords survive both runs)
+;   psqdiff16x16 <- one int32  (sum of all four dwords of xmm4)
+; tmp_sadframe accumulates sad0 + sad1 of every run and is copied to *psadframe at the end.
+;
+; All output-pointer stack slots are advanced in place and the iPicWidth / iPicHeight slots are
+; overwritten with tile counts. Both loops test their counter only after the body, so
+; iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: esi = cur row pointer    edi = ref row pointer   ebx = iPicStride
+;            eax = iPicStride * 16    ecx = tiles left / byte scratch
+;            edx, ebp = scratch
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+%define localsize 16
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define p_sd8x8 esp + pushsize + localsize + 44
+%define p_mad8x8 esp + pushsize + localsize + 48
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define tmp_ecx esp + 12
+%define pushsize 16
+    push ebp
+    push esi
+    push edi
+    push ebx
+    sub esp, localsize
+    mov esi, [cur_data]
+    mov edi, [ref_data]
+    mov ebx, [iPicStride]
+    mov eax, ebx
+
+    shr dword [iPicWidth], 4            ; iPicWidth/16
+    shr dword [iPicHeight], 4           ; iPicHeight/16
+    shl eax, 4                          ; iPicStride*16
+    pxor xmm0, xmm0
+    movd [tmp_sadframe], xmm0           ; running frame total starts at zero
+sqdiff_bgd_height_loop:
+    mov ecx, dword [iPicWidth]
+    mov [tmp_esi], esi                  ; remember where this tile row starts
+    mov [tmp_edi], edi
+sqdiff_bgd_width_loop:
+    pxor xmm7, xmm7                     ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+    pxor xmm6, xmm6                     ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+    pxor xmm5, xmm5                     ; pMad8x8
+    pxor xmm4, xmm4                     ; sqdiff_16x16 four Dword
+%rep 8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi, edi, ebx
+%endrep
+
+    ; ---- outputs of the first run ----
+    mov edx, [psad8x8]
+    movdqa xmm2, xmm7
+    pshufd xmm1, xmm2, 00001110b        ; bring dword 2 (sad1) down to dword 0
+    movd [edx], xmm2
+    movd [edx+4], xmm1
+    add edx, 8
+    mov [psad8x8], edx                  ; sad8x8
+
+    paddd xmm1, xmm2
+    movd edx, xmm1                      ; sad0 + sad1
+    add [tmp_sadframe], edx             ; iFrameSad
+
+    mov edx, [psum16x16]
+    movdqa xmm1, xmm6
+    pshufd xmm2, xmm1, 00001110b
+    paddd xmm1, xmm2
+    movd [edx], xmm1                    ; sum (first run; the second run adds to it)
+
+    mov edx, [p_sd8x8]
+    pshufd xmm1, xmm6, 11110101b        ; Sref1 Sref1 Sref0 Sref0
+    psubd xmm6, xmm1                    ; 00 diff1 00 diff0
+    pshufd xmm1, xmm6, 00001000b        ; xx xx diff1 diff0
+    movq [edx], xmm1
+    add edx, 8
+    mov [p_sd8x8], edx
+
+    mov edx, [p_mad8x8]
+    WELS_MAX_REG_SSE2 xmm5
+    mov [tmp_ecx], ecx                  ; ecx is borrowed as byte scratch until the loop tail
+    movhlps xmm1, xmm5
+    movd ecx, xmm5
+    mov [edx], cl
+    movd ecx, xmm1
+    mov [edx+1],cl
+    add edx, 2
+    mov [p_mad8x8], edx
+
+    ; ---- second run: keep sqsum (odd dwords of xmm7), restart sad / sums / mad ----
+    psrlq xmm7, 32
+    psllq xmm7, 32                      ; clear sad
+    pxor xmm6, xmm6                     ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+    pxor xmm5, xmm5                     ; pMad8x8
+%rep 8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi, edi, ebx
+%endrep
+
+    mov edx, [psad8x8]
+    movdqa xmm2, xmm7
+    pshufd xmm1, xmm2, 00001110b
+    movd [edx], xmm2
+    movd [edx+4], xmm1
+    add edx, 8
+    mov [psad8x8], edx                  ; sad8x8
+
+    paddd xmm1, xmm2
+    movd edx, xmm1
+    add [tmp_sadframe], edx             ; iFrameSad
+
+    mov edx, [psum16x16]
+    movdqa xmm1, xmm6
+    pshufd xmm2, xmm1, 00001110b
+    paddd xmm1, xmm2
+    movd ebp, xmm1                      ; sum
+    add [edx], ebp                      ; complete the 16x16 sum started by the first run
+    add edx, 4
+    mov [psum16x16], edx
+
+    mov edx, [psqsum16x16]
+    psrlq xmm7, 32                      ; move sqsum0 / sqsum1 into the even dwords
+    pshufd xmm2, xmm7, 00001110b
+    paddd xmm2, xmm7
+    movd [edx], xmm2                    ; sqsum
+    add edx, 4
+    mov [psqsum16x16], edx
+
+    mov edx, [p_sd8x8]
+    pshufd xmm1, xmm6, 11110101b        ; Sref1 Sref1 Sref0 Sref0
+    psubd xmm6, xmm1                    ; 00 diff1 00 diff0
+    pshufd xmm1, xmm6, 00001000b        ; xx xx diff1 diff0
+    movq [edx], xmm1
+    add edx, 8
+    mov [p_sd8x8], edx
+
+    mov edx, [p_mad8x8]
+    WELS_MAX_REG_SSE2 xmm5
+    movhlps xmm1, xmm5
+    movd ecx, xmm5
+    mov [edx], cl
+    movd ecx, xmm1
+    mov [edx+1],cl
+    add edx, 2
+    mov [p_mad8x8], edx
+
+    ; *psqdiff16x16++ = horizontal sum of the four dwords of xmm4
+    mov edx, [psqdiff16x16]
+    pshufd xmm1, xmm4, 00001110b
+    paddd xmm4, xmm1
+    pshufd xmm1, xmm4, 00000001b
+    paddd xmm4, xmm1
+    movd [edx], xmm4
+    add edx, 4
+    mov [psqdiff16x16], edx
+
+    ; next tile: back up 16 rows, right 16 columns
+    sub esi, eax
+    sub edi, eax
+    add esi, 16
+    add edi, 16
+
+    mov ecx, [tmp_ecx]                  ; restore the column counter
+    dec ecx
+    jnz sqdiff_bgd_width_loop
+
+    mov esi, [tmp_esi]
+    mov edi, [tmp_edi]
+    add esi, eax
+    add edi, eax
+
+    dec dword [iPicHeight]
+    jnz sqdiff_bgd_height_loop
+
+    mov edx, [psadframe]
+    mov ebp, [tmp_sadframe]
+    mov [edx], ebp
+
+    add esp, localsize
+    pop ebx
+    pop edi
+    pop esi
+    pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef tmp_ecx
+%undef pushsize
+%undef localsize
+    ret
+%else
+
+;*************************************************************************************************************
+; void VAACalcSadBgd_sse2 (const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                          int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
+;                          uint8_t *p_mad8x8)
+;
+; x86-64 variant. ref_data/iPicWidth/iPicHeight are used through r1..r3 and iPicStride through
+; r4; cur_data is moved to r15 because r0 serves as byte scratch. The remaining arguments are
+; reached through the argN macros (defined elsewhere), whose stack forms depend on push_num -
+; hence the %assign bookkeeping around "push r2".
+;
+; The picture is visited in 16x16 tiles. Each tile is processed as two runs of eight
+; WELS_SAD_SD_MAD_16x1_SSE2 invocations (macro defined elsewhere), with xmm7/xmm6/xmm5 shifted
+; left by one dword between the runs. Per tile:
+;   p_mad8x8 <- four bytes (two after each run, taken from xmm4 after WELS_MAX_REG_SSE2)
+;   psad8x8  <- four int32, xmm7 reordered by pshufd 10001101b
+;   p_sd8x8  <- four int32, (xmm6 - xmm5) reordered the same way
+; xmm8 accumulates the sum of the four xmm7 dwords of every tile; its low dword is stored to
+; *psadframe at the end.
+;
+; psad8x8 and p_sd8x8 are written with movdqa, so both must stay 16-byte aligned. The
+; psad8x8 / p_sd8x8 / p_mad8x8 argument slots are advanced in place. Both loops test their
+; counter only after the body, so iPicWidth and iPicHeight must each be at least 16.
+;
+; Registers: r15 = cur row pointer   r1 = ref row pointer   r4 = iPicStride   r13 = iPicStride * 16
+;            r2 = tiles left in row (reloaded from the stack per tile row)    r3 = tile rows left
+;            r10/r11 = row-start copies of r15/r1   r14 = scratch output pointer   r0 = byte scratch
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+%define     cur_data        arg1
+%define     ref_data        arg2
+%define     iPicWidth       arg3
+%define     iPicHeight      arg4
+%define     iPicStride      arg5
+%define     psadframe       arg6
+%define     psad8x8         arg7
+%define     p_sd8x8         arg8
+%define     p_mad8x8        arg9
+
+    push    r12
+    push    r13
+    push    r14
+    push    r15
+%assign push_num 4
+    PUSH_XMM 10
+%ifdef WIN64
+    mov     r4,     arg5                    ; iPicStride is only loaded explicitly on WIN64
+%endif
+    mov     r14,    arg7                    ; NOTE(review): r14 is reloaded before its first use below
+    SIGN_EXTENSION  r2, r2d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+
+    mov     r13,    r4
+    mov     r15,    r0                      ; cur_data moves out of r0, which becomes scratch
+    shr     r2,     4                       ; tiles per row
+    shr     r3,     4                       ; tile rows
+    shl     r13,    4                       ; r13 = iPicStride * 16
+    pxor    xmm0,   xmm0
+    pxor    xmm8,   xmm8                    ; frame total
+    pxor    xmm9,   xmm9
+
+bgd_height_loop:
+    push    r2                              ; keep the per-row tile count; r2 is the inner counter
+    %assign push_num push_num+1
+    mov     r10,    r15                     ; remember where this tile row starts
+    mov     r11,    r1
+
+bgd_width_loop:
+    pxor    xmm7,   xmm7                    ; pSad8x8
+    pxor    xmm6,   xmm6                    ; sum_cur_8x8
+    pxor    xmm5,   xmm5                    ; sum_ref_8x8
+    pxor    xmm4,   xmm4                    ; pMad8x8
+%rep 8
+    WELS_SAD_SD_MAD_16x1_SSE2   xmm7, xmm6, xmm5, xmm4, r15, r1, r4
+%endrep
+
+    ; two p_mad8x8 bytes for the first run: byte 0 of each quadword of xmm4
+    mov     r14,    p_mad8x8
+    WELS_MAX_REG_SSE2   xmm4
+    movhlps xmm1,   xmm4
+    movd    r0d,    xmm4
+    mov     [r14],      r0b
+    movd    r0d,    xmm1
+    mov     [r14+1],    r0b
+    add     r14,    2                       ; cursor stays in r14 across the second run
+
+    ; shift the first run's results up one dword to make room for the second run
+    pslldq  xmm7,   4
+    pslldq  xmm6,   4
+    pslldq  xmm5,   4
+
+    pxor    xmm4,   xmm4                    ; pMad8x8
+%rep 8
+    WELS_SAD_SD_MAD_16x1_SSE2   xmm7, xmm6, xmm5, xmm4, r15, r1, r4
+%endrep
+
+    ; two p_mad8x8 bytes for the second run, then write the cursor back
+    WELS_MAX_REG_SSE2   xmm4
+    movhlps xmm1,   xmm4
+    movd    r0d,    xmm4
+    mov     [r14],      r0b
+    movd    r0d,    xmm1
+    mov     [r14+1],    r0b
+    add     r14,    2
+    mov     p_mad8x8,   r14
+
+    ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+    mov     r14,    psad8x8
+    pshufd  xmm1,   xmm7, 10001101b         ; D3 D2 D1 D0
+    movdqa  [r14],  xmm1
+    add     r14,    16
+    mov     psad8x8,    r14                 ; sad8x8
+
+    paddd   xmm1,   xmm7                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd  xmm2,   xmm1, 00000011b
+    paddd   xmm1,   xmm2
+    movd    r14d,   xmm1                    ; isolate the tile total ...
+    movd    xmm9,   r14d
+    paddd   xmm8,   xmm9                    ; ... and add it to the frame total
+
+    mov     r14,    p_sd8x8
+    psubd   xmm6,   xmm5                    ; sum_cur - sum_ref
+    pshufd  xmm1,   xmm6, 10001101b
+    movdqa  [r14],  xmm1
+    add     r14,    16
+    mov     p_sd8x8,    r14
+
+    ; next tile: back up 16 rows, right 16 columns
+    sub     r15,    r13
+    sub     r1,     r13
+    add     r15,    16
+    add     r1,     16
+
+    dec     r2
+    jnz     bgd_width_loop
+
+    pop     r2
+%assign push_num push_num-1
+    ; next tile row
+    mov     r15,    r10
+    mov     r1,     r11
+    add     r15,    r13
+    add     r1,     r13
+
+    dec     r3
+    jnz     bgd_height_loop
+
+    mov     r13,    psadframe
+    movd    [r13],  xmm8
+
+    POP_XMM
+    pop     r15
+    pop     r14
+    pop     r13
+    pop     r12
+%assign push_num 0
+%undef      cur_data
+%undef      ref_data
+%undef      iPicWidth
+%undef      iPicHeight
+%undef      iPicStride
+%undef      psadframe
+%undef      psad8x8
+%undef      p_sd8x8
+%undef      p_mad8x8
+%undef      tmp_esi
+%undef      tmp_edi
+%undef      pushsize
+%undef      localsize
+    ret
+
+
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Walks the picture in 16x16 tiles. Per tile: 4 dwords to psad8x8, 4 dwords to p_sd8x8 and 4 bytes to p_mad8x8 (half of each
+; after every group of eight rows), 1 dword each to psum16x16, psqsum16x16 and psqdiff16x16; psadframe gets 1 dword at the end.
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+%define cur_data arg1; const uint8_t *cur_data
+%define ref_data arg2; const uint8_t *ref_data
+%define iPicWidth arg3; int32_t
+%define iPicHeight arg4; int32_t
+%define iPicStride arg5; int32_t
+%define psadframe arg6; int32_t *, written once after the last tile
+%define psad8x8 arg7; int32_t *, advanced by 8 bytes twice per tile
+%define psum16x16 arg8; int32_t *, advanced by 4 bytes per tile
+%define psqsum16x16 arg9; int32_t *, advanced by 4 bytes per tile
+%define psqdiff16x16 arg10; int32_t *, advanced by 4 bytes per tile
+%define p_sd8x8 arg11
+%define p_mad8x8 arg12
+; NOTE(review): the first five names are never used below; the code reads r0-r4 instead, presumably aliases of arg1-arg5 - confirm in the include file.
+ push r12 ; saved and restored, but r12 is not referenced anywhere else in this function
+ push r13 ; r13 will hold iPicStride*16
+ push r14 ; r14 is the scratch pointer for the output arrays
+ push r15 ; r15 is used once, as scratch for the second-half psum16x16 update
+%assign push_num 4
+ PUSH_XMM 10
+%ifdef WIN64
+ mov r4,arg5 ; WIN64 only: load iPicStride (arg5) into r4
+ ;mov r5,arg6
+%endif
+ SIGN_EXTENSION r2,r2d ; macro defined outside this chunk; by its name it widens the 32-bit value to 64 bits
+ SIGN_EXTENSION r3,r3d
+ SIGN_EXTENSION r4,r4d
+
+ mov r13,r4 ; r13 = iPicStride
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
+ shl r13, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; not referenced by name below - presumably the zero register the macros expect (TODO confirm)
+ pxor xmm8, xmm8 ; xmm8 = running frame SAD
+ pxor xmm9, xmm9 ; xmm9 = scratch for the per-half SAD total
+
+; Outer loop: one pass per row of 16x16 tiles. r10/r11 keep the row-start values of the working pointers r0/r1.
+sqdiff_bgd_height_loop:
+ mov r10, r0
+ mov r11, r1
+ push r2 ; preserve the column count across the inner loop; push_num is bumped so stack-relative argN stay valid
+%assign push_num push_num+1
+sqdiff_bgd_width_loop:
+
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+; First group of eight rows done: emit sad0/sad1, start the tile sum, emit diff0/diff1 and the first two p_mad8x8 bytes.
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b ; lane 0 of xmm1 = lane 2 of xmm7 (sad1)
+ movd [r14], xmm2 ; sad0
+ movd [r14+4], xmm1 ; sad1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
+
+ paddd xmm1, xmm2 ; lane 0 = sad0 + sad1
+ movd r14d, xmm1 ; round trip through r14d keeps lane 0 only
+ movd xmm9,r14d
+ paddd xmm8, xmm9 ; iFrameSad
+
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b ; lane 0 of xmm2 = lane 2 of xmm6 (Scur1)
+ paddd xmm1, xmm2 ; lane 0 = Scur0 + Scur1
+ movd [r14], xmm1 ; sum
+; The psum16x16 pointer is not advanced here: the second half adds its part to the same dword first.
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1 ; 8-byte store: diff0, diff1
+ add r14, 8
+ mov p_sd8x8, r14
+
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
+
+ movhlps xmm1, xmm5 ; low qword of xmm1 = high qword of xmm5
+ push r0 ; r0 is the live row pointer, so it is saved around its use as byte-store scratch
+ movd r0d, xmm5 ; push_num is not adjusted for this push; no argN name is used before the matching pop
+ mov [r14], r0b ; byte 0: taken from the low half of xmm5
+ movd r0d, xmm1
+ mov [r14+1],r0b ; byte 1: taken from the high half of xmm5
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
+; Second group of eight rows: the SAD lanes of xmm7 restart from zero, its sqsum lanes and xmm4 keep accumulating.
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b ; lane 0 of xmm1 = lane 2 of xmm7
+ movd [r14], xmm2 ; third SAD dword of this tile
+ movd [r14+4], xmm1 ; fourth SAD dword of this tile
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
+
+ paddd xmm1, xmm2 ; lane 0 = sum of the two second-half SAD values
+ movd r14d, xmm1 ; round trip through r14d keeps lane 0 only
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; iFrameSad
+
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2 ; lane 0 = Scur0 + Scur1 of the second half
+ movd r15d, xmm1 ; sum
+ add [r14], r15d ; add to the first-half value already stored in this dword
+ add r14, 4
+ mov psum16x16, r14
+
+ mov r14, psqsum16x16
+ psrlq xmm7, 32 ; bring the sqsum lanes (odd dwords) down to the even dwords
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7 ; lane 0 = sqsum0 + sqsum1
+ movd [r14], xmm2 ; sqsum
+ add r14, 4
+ mov psqsum16x16, r14
+
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1 ; 8-byte store: diff0, diff1 of the second half
+ add r14, 8
+ mov p_sd8x8, r14
+
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
+
+; Same byte extraction as after the first group; r0 is again saved around its use as scratch.
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b ; byte 2 of this tile's p_mad8x8 output
+ movd r0d, xmm1
+ mov [r14+1],r0b ; byte 3 of this tile's p_mad8x8 output
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
+
+ mov r14, psqdiff16x16
+ pshufd xmm1, xmm4, 00001110b ; fold the upper two dwords onto the lower two
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b ; fold dword 1 onto dword 0
+ paddd xmm4, xmm1 ; lane 0 = sum of all four dwords of xmm4
+ movd [r14], xmm4
+ add r14, 4
+ mov psqdiff16x16, r14
+
+ add r14, 16 ; NOTE(review): no visible effect - r14 is reloaded before its next read on both paths
+ sub r0, r13 ; next tile: both pointers go back by stride*16 ...
+ sub r1, r13
+ add r0, 16 ; ... and forward by 16 bytes
+ add r1, 16
+; NOTE(review): r2 is not tested for zero before the loop; if the width is below 16, dec runs on a zero count.
+ dec r2
+ jnz sqdiff_bgd_width_loop
+ pop r2 ; restore the column count for the next tile row
+ %assign push_num push_num-1
+ mov r0, r10 ; back to the start of this tile row ...
+ mov r1, r11
+ add r0, r13 ; ... then down by stride*16
+ add r1, r13
+
+ dec r3 ; same bottom-tested pattern for the row count
+ jnz sqdiff_bgd_height_loop
+
+ mov r14, psadframe
+ movd [r14], xmm8 ; store lane 0 of the frame SAD accumulator
+
+ POP_XMM
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%assign push_num 0
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+%endif
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -21,9 +21,9 @@
ifeq ($(ASM_ARCH), x86)
PROCESSING_ASM_SRCS=\
- $(PROCESSING_SRCDIR)/src/asm/denoisefilter.asm\
- $(PROCESSING_SRCDIR)/src/asm/downsample_bilinear.asm\
- $(PROCESSING_SRCDIR)/src/asm/vaa.asm\
+ $(PROCESSING_SRCDIR)/src/x86/denoisefilter.asm\
+ $(PROCESSING_SRCDIR)/src/x86/downsample_bilinear.asm\
+ $(PROCESSING_SRCDIR)/src/x86/vaa.asm\
PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.$(OBJ))
endif