ref: 1928b9a1ba6f7c18ebc058835f308c3d7016a1da
dir: /codec/decoder/core/asm/block_add.asm/
;*! ;* \copy ;* Copyright (c) 2009-2013, Cisco Systems ;* All rights reserved. ;* ;* Redistribution and use in source and binary forms, with or without ;* modification, are permitted provided that the following conditions ;* are met: ;* ;* * Redistributions of source code must retain the above copyright ;* notice, this list of conditions and the following disclaimer. ;* ;* * Redistributions in binary form must reproduce the above copyright ;* notice, this list of conditions and the following disclaimer in ;* the documentation and/or other materials provided with the ;* distribution. ;* ;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  block_add.asm
;*
;*  Abstract
;*      add block
;*
;*  History
;*      09/21/2009 Created
;*
;*
;*************************************************************************/

; Syntax: NASM (Intel operand order), 32-bit code. The exported functions read
; their arguments from the stack and preserve esi.

%include "asm_inc.asm"

BITS 32

;*******************************************************************************
; Macros and other preprocessor constants
;
; Conventions shared by the macros below:
;   * None of them initialises xmm7 / mm7. The unpack, compare and ptest steps
;     only behave as described when the caller has zeroed that register first.
;   * movdqa faults on addresses that are not 16-byte aligned, so every memory
;     operand accessed with movdqa must be 16-byte aligned.
;   * All macros clobber the SIMD registers listed in their header; the
;     CHECK_* macros additionally clobber EFLAGS and the listed GPRs.
;*******************************************************************************

;-------------------------------------------------------------------------------
; BLOCK_ADD_16_SSE2  dst, pred, res, stride
;   [dst] = saturate_u8( zero_extend(16 bytes at [pred]) + 16 words at [res] )
;   Then advances: pred += stride, res += stride*2, dst += stride.
;   Clobbers: xmm0, xmm1, xmm2, xmm6.
;-------------------------------------------------------------------------------
%macro BLOCK_ADD_16_SSE2 4
    movdqa      xmm0, [%2]              ; 16 source bytes
    movdqa      xmm1, [%3]              ; words 0..7
    movdqa      xmm2, [%3+10h]          ; words 8..15
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm7              ; low 8 bytes  -> words
    punpckhbw   xmm6, xmm7              ; high 8 bytes -> words
    paddw       xmm0, xmm1
    paddw       xmm6, xmm2
    packuswb    xmm0, xmm6              ; back to bytes, unsigned saturation
    movdqa      [%1], xmm0
    lea         %2, [%2+%4]
    lea         %3, [%3+%4*2]
    lea         %1, [%1+%4]
%endmacro

;-------------------------------------------------------------------------------
; BLOCK_ADD_8_MMXEXT  dst, pred, res, stride
;   8-byte-wide counterpart of BLOCK_ADD_16_SSE2 using MMX registers.
;   Then advances: pred += stride, res += stride*2, dst += stride.
;   Clobbers: mm0, mm1, mm2, mm6.
;-------------------------------------------------------------------------------
%macro BLOCK_ADD_8_MMXEXT 4
    movq        mm0, [%2]               ; 8 source bytes
    movq        mm1, [%3]               ; words 0..3
    movq        mm2, [%3+08h]           ; words 4..7
    movq        mm6, mm0
    punpcklbw   mm0, mm7                ; low 4 bytes  -> words
    punpckhbw   mm6, mm7                ; high 4 bytes -> words
    paddw       mm0, mm1
    paddw       mm6, mm2
    packuswb    mm0, mm6                ; back to bytes, unsigned saturation
    movq        [%1], mm0
    lea         %2, [%2+%4]
    lea         %3, [%3+%4*2]
    lea         %1, [%1+%4]
%endmacro

;-------------------------------------------------------------------------------
; BLOCK_ADD_16_STRIDE_SSE2  dst, pred, res, stride, res_stride
;   Same arithmetic as BLOCK_ADD_16_SSE2, but the word buffer advances by its
;   own stride: pred += stride, res += res_stride*2, dst += stride.
;   Clobbers: xmm0, xmm1, xmm2, xmm6.
;-------------------------------------------------------------------------------
%macro BLOCK_ADD_16_STRIDE_SSE2 5
    movdqa      xmm0, [%2]
    movdqa      xmm1, [%3]
    movdqa      xmm2, [%3+10h]
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm7
    punpckhbw   xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm6, xmm2
    packuswb    xmm0, xmm6
    movdqa      [%1], xmm0
    lea         %2, [%2+%4]
    lea         %3, [%3+%5*2]
    lea         %1, [%1+%4]
%endmacro

;-------------------------------------------------------------------------------
; BLOCK_ADD_8_STRIDE_MMXEXT  dst, pred, res, stride, res_stride
;   Same arithmetic as BLOCK_ADD_8_MMXEXT, but the word buffer advances by its
;   own stride: pred += stride, res += res_stride*2, dst += stride.
;   Clobbers: mm0, mm1, mm2, mm6.
;-------------------------------------------------------------------------------
%macro BLOCK_ADD_8_STRIDE_MMXEXT 5
    movq        mm0, [%2]
    movq        mm1, [%3]
    movq        mm2, [%3+08h]
    movq        mm6, mm0
    punpcklbw   mm0, mm7
    punpckhbw   mm6, mm7
    paddw       mm0, mm1
    paddw       mm6, mm2
    packuswb    mm0, mm6
    movq        [%1], mm0
    lea         %2, [%2+%4]
    lea         %3, [%3+%5*2]
    lea         %1, [%1+%4]
%endmacro

;-------------------------------------------------------------------------------
; BLOCK_ADD_8_STRIDE_2_LINES_SSE2  dst, pred, res, stride, res_stride
;   Processes two 8-byte rows per expansion:
;     row 0: [dst]        = sat_u8( [pred]        + 8 words at [res] )
;     row 1: [dst+stride] = sat_u8( [pred+stride] + 8 words at [res+res_stride*2] )
;   Then advances: dst += stride*2, pred += stride*2, res += res_stride*4.
;   Clobbers: xmm0, xmm1, xmm2, xmm3.
;-------------------------------------------------------------------------------
%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
    ; first row
    movdqa      xmm1, [%3]
    movq        xmm0, [%2]
    punpcklbw   xmm0, xmm7
    paddw       xmm0, xmm1
    packuswb    xmm0, xmm7              ; only the low 8 result bytes are stored
    movq        [%1], xmm0

    ; second row
    movdqa      xmm3, [%3+%5*2]
    movq        xmm2, [%2+%4]
    punpcklbw   xmm2, xmm7
    paddw       xmm2, xmm3
    packuswb    xmm2, xmm7
    movq        [%1+%4], xmm2

    lea         %1, [%1+%4*2]
    lea         %2, [%2+%4*2]
    lea         %3, [%3+%5*4]
%endmacro

;-------------------------------------------------------------------------------
; CHECK_DATA_16_ZERO_SSE4  data, flags, nonzero_value
;   Tests the 32 bytes at [data] for any set bit and writes one byte to
;   [flags + idx], where idx is the dword read from [ecx]:
;     0                      if all 32 bytes are zero
;     low byte of %3         otherwise
;   With xmm7 == 0, ptest sets CF exactly when xmm0 is all zero, so cmovae
;   (CF == 0) fires for non-zero data.
;   Then advances: data += 20h, ecx += 4.
;   Implicit input: ecx -> table of dword indices.
;   Clobbers: eax, ebx, ecx, xmm0, xmm1, EFLAGS.
;-------------------------------------------------------------------------------
%macro CHECK_DATA_16_ZERO_SSE4 3
    xor         eax, eax                ; default result: block is zero
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1+10h]
    mov         ebx, [ecx]              ; destination index for this block
    por         xmm0, xmm1
    ptest       xmm7, xmm0
    cmovae      eax, %3
    add         %1, 20h
    add         ecx, 04h
    mov         byte [%2+ebx], al
%endmacro

;-------------------------------------------------------------------------------
; CHECK_RS_4x4_BLOCK_2_ZERO_SSE4  data, flags, stride, stride3, nonzero_value
;   Loads four 16-byte rows at [data], [data+stride], [data+stride*2] and
;   [data+stride3], then ORs the low 8-byte halves together and the high
;   8-byte halves together.
;     [flags]   = 0 if all low halves are zero,  else low byte of %5
;     [flags+1] = 0 if all high halves are zero, else low byte of %5
;   %4 is used verbatim as the fourth row offset (callers presumably pass
;   stride*3 -- confirm at the call site).
;   Clobbers: eax, ebx, xmm0-xmm5, EFLAGS. Does not advance any pointer.
;-------------------------------------------------------------------------------
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE4 5
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1+%3]
    movdqa      xmm2, [%1+%3*2]
    movdqa      xmm3, [%1+%4]
    xor         eax, eax                ; default results: both halves zero
    xor         ebx, ebx
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm2
    punpcklqdq  xmm0, xmm1              ; low halves of rows 0,1
    punpckhqdq  xmm4, xmm1              ; high halves of rows 0,1
    punpcklqdq  xmm2, xmm3              ; low halves of rows 2,3
    punpckhqdq  xmm5, xmm3              ; high halves of rows 2,3
    por         xmm0, xmm2
    por         xmm4, xmm5
    ptest       xmm7, xmm0
    cmovae      eax, %5
    ptest       xmm7, xmm4
    cmovae      ebx, %5
    mov         byte [%2], al
    mov         byte [%2+1], bl
%endmacro

;-------------------------------------------------------------------------------
; DATA_COPY_16x2_SSE2  src, dst, src_stride
;   Copies two 32-byte rows ([src], [src+src_stride]) to 64 contiguous bytes
;   at [dst]. Then advances: src += src_stride*2, dst += 40h.
;   Clobbers: xmm0-xmm3.
;-------------------------------------------------------------------------------
%macro DATA_COPY_16x2_SSE2 3
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1+10h]
    movdqa      xmm2, [%1+%3]
    movdqa      xmm3, [%1+%3+10h]
    movdqa      [%2], xmm0
    movdqa      [%2+10h], xmm1
    movdqa      [%2+20h], xmm2
    movdqa      [%2+30h], xmm3
    lea         %1, [%1+%3*2]
    lea         %2, [%2+40h]
%endmacro

;-------------------------------------------------------------------------------
; DATA_COPY_8x4_SSE2  src, dst, src_stride, src_stride3
;   Copies four 16-byte rows ([src], [src+src_stride], [src+src_stride*2],
;   [src+src_stride3]) to 64 contiguous bytes at [dst].
;   Then advances: src += src_stride*4, dst += 40h.
;   %4 is used verbatim as the fourth row offset (callers presumably pass
;   src_stride*3 -- confirm at the call site).
;   Clobbers: xmm0-xmm3.
;-------------------------------------------------------------------------------
%macro DATA_COPY_8x4_SSE2 4
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1+%3]
    movdqa      xmm2, [%1+%3*2]
    movdqa      xmm3, [%1+%4]
    movdqa      [%2], xmm0
    movdqa      [%2+10h], xmm1
    movdqa      [%2+20h], xmm2
    movdqa      [%2+30h], xmm3
    lea         %1, [%1+%3*4]
    lea         %2, [%2+40h]
%endmacro

;-------------------------------------------------------------------------------
; CHECK_DATA_16_ZERO_SSE2  data, flags, nonzero_value
;   SSE2 equivalent of CHECK_DATA_16_ZERO_SSE4: writes one byte to
;   [flags + idx], where idx is the dword read from [ecx]:
;     0                      if all 16 words at [data] are zero
;     low byte of %3         otherwise
;   With xmm7 == 0, pcmpeqw/packsswb/pmovmskb produce the mask 0ffffh only
;   when every word is zero; any smaller mask makes the subtraction borrow,
;   so cmovb fires for non-zero data.
;   The non-zero value is taken from %3, matching the SSE4 variant, instead of
;   a hard-coded register.
;   Then advances: ecx += 4, data += 20h.
;   Implicit input: ecx -> table of dword indices.
;   Clobbers: eax, ebx, ecx, edx, xmm0, xmm1, EFLAGS.
;-------------------------------------------------------------------------------
%macro CHECK_DATA_16_ZERO_SSE2 3
    xor         eax, eax                ; default result: block is zero
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1+10h]
    mov         ebx, [ecx]              ; destination index for this block
    pcmpeqw     xmm0, xmm7
    pcmpeqw     xmm1, xmm7
    packsswb    xmm0, xmm1
    pmovmskb    edx, xmm0
    sub         edx, 0ffffh             ; CF = 1 when mask < 0ffffh
    cmovb       eax, %3
    add         ecx, 4
    add         %1, 20h
    mov         byte [%2+ebx], al
%endmacro

;-------------------------------------------------------------------------------
; CHECK_RS_4x4_BLOCK_2_ZERO_SSE2  data, flags, stride, stride3, nonzero_value
;   SSE2 equivalent of CHECK_RS_4x4_BLOCK_2_ZERO_SSE4:
;     [flags]   = 0 if the low 8-byte halves of all four rows are zero,
;                 else low byte of %5
;     [flags+1] = same test for the high 8-byte halves
;   %4 is used verbatim as the fourth row offset (callers presumably pass
;   stride*3 -- confirm at the call site).
;   Clobbers: eax, ebx, xmm0-xmm5, EFLAGS. Does not advance any pointer.
;-------------------------------------------------------------------------------
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 5
    movdqa      xmm0, [%1]
    movdqa      xmm1, [%1 + %3]
    movdqa      xmm2, [%1 + %3*2]
    movdqa      xmm3, [%1 + %4]
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm2
    punpcklqdq  xmm0, xmm1              ; low halves of rows 0,1
    punpckhqdq  xmm4, xmm1              ; high halves of rows 0,1
    punpcklqdq  xmm2, xmm3              ; low halves of rows 2,3
    punpckhqdq  xmm5, xmm3              ; high halves of rows 2,3
    pcmpeqw     xmm0, xmm7
    pcmpeqw     xmm2, xmm7
    pcmpeqw     xmm4, xmm7
    pcmpeqw     xmm5, xmm7
    packsswb    xmm0, xmm2
    packsswb    xmm4, xmm5
    pmovmskb    eax, xmm0
    pmovmskb    ebx, xmm4
    sub         eax, 0ffffh             ; CF = 1 when mask < 0ffffh
    mov         eax, 0                  ; must stay MOV: XOR would destroy CF
    cmovb       eax, %5
    sub         ebx, 0ffffh
    mov         ebx, 0                  ; must stay MOV: XOR would destroy CF
    cmovb       ebx, %5
    mov         byte [%2], al
    mov         byte [%2+1], bl
%endmacro

;*******************************************************************************
; Data
;*******************************************************************************

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata align=16
%endif

ALIGN 16
; 24 dword indices, four per row.
SubMbScanIdx:
    dd  0x0,  0x1,  0x4,  0x5
    dd  0x2,  0x3,  0x6,  0x7
    dd  0x8,  0x9,  0xc,  0xd
    dd  0xa,  0xb,  0xe,  0xf
    dd  0x10, 0x11, 0x14, 0x15
    dd  0x12, 0x13, 0x16, 0x17

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

WELS_EXTERN WelsResBlockZero16x16_sse2

ALIGN 16
;*******************************************************************************
;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;
;  Writes zero to 16 rows of 32 bytes (16 int16_t each). Consecutive rows are
;  iStride*2 bytes apart.
;  In:    [esp+04h] = pBlock, [esp+08h] = iStride (offsets at function entry)
;  Clobb: eax, ecx, xmm7; esi is saved and restored
;  Note:  stores use movdqa, so pBlock and iStride*2 must be 16-byte aligned.
;*******************************************************************************
WelsResBlockZero16x16_sse2:
    push        esi
    mov         esi, [esp+08h]          ; esi = pBlock
    mov         ecx, [esp+0ch]          ; ecx = iStride
    lea         ecx, [ecx*2]            ; ecx = row pitch in bytes
    lea         eax, [ecx*3]            ; eax = 3 * row pitch
    pxor        xmm7, xmm7

    ; rows 0..3
    movdqa      [esi], xmm7
    movdqa      [esi+10h], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx+10h], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+ecx*2+10h], xmm7
    movdqa      [esi+eax], xmm7
    movdqa      [esi+eax+10h], xmm7

    ; rows 4..7
    lea         esi, [esi+ecx*4]
    movdqa      [esi], xmm7
    movdqa      [esi+10h], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx+10h], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+ecx*2+10h], xmm7
    movdqa      [esi+eax], xmm7
    movdqa      [esi+eax+10h], xmm7

    ; rows 8..11
    lea         esi, [esi+ecx*4]
    movdqa      [esi], xmm7
    movdqa      [esi+10h], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx+10h], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+ecx*2+10h], xmm7
    movdqa      [esi+eax], xmm7
    movdqa      [esi+eax+10h], xmm7

    ; rows 12..15
    lea         esi, [esi+ecx*4]
    movdqa      [esi], xmm7
    movdqa      [esi+10h], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx+10h], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+ecx*2+10h], xmm7
    movdqa      [esi+eax], xmm7
    movdqa      [esi+eax+10h], xmm7

    pop         esi
    ret


WELS_EXTERN WelsResBlockZero8x8_sse2

ALIGN 16
;*******************************************************************************
;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;
;  Writes zero to 8 rows of 16 bytes (8 int16_t each). Consecutive rows are
;  iStride*2 bytes apart.
;  In:    [esp+04h] = pBlock, [esp+08h] = iStride (offsets at function entry)
;  Clobb: eax, ecx, xmm7; esi is saved and restored
;  Note:  stores use movdqa, so pBlock and iStride*2 must be 16-byte aligned.
;*******************************************************************************
WelsResBlockZero8x8_sse2:
    push        esi
    mov         esi, [esp+08h]          ; esi = pBlock
    mov         ecx, [esp+0ch]          ; ecx = iStride
    lea         ecx, [ecx*2]            ; ecx = row pitch in bytes
    lea         eax, [ecx*3]            ; eax = 3 * row pitch
    pxor        xmm7, xmm7

    ; rows 0..3
    movdqa      [esi], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+eax], xmm7

    ; rows 4..7
    lea         esi, [esi+ecx*4]
    movdqa      [esi], xmm7
    movdqa      [esi+ecx], xmm7
    movdqa      [esi+ecx*2], xmm7
    movdqa      [esi+eax], xmm7

    pop         esi
    ret