shithub: openh264

ref: 1ca805cd855cb57d3dfc1604544eacfac9702728
dir: /processing/src/asm/intra_pred.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  intra_pred.asm
;*
;*  Abstract
;*      SSE2 functions for intra prediction operations
;*
;*  History
;*      18/09/2009 Created
;*
;*
;*************************************************************************/
%include "../../src/asm/asm_inc.asm"

; Assemble everything below as 32-bit code. The functions in this file read
; their arguments from the stack ([esp+4], [esp+8], [esp+12]).
BITS 32
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

; Select the read-only data section. The section attributes are spelled
; differently depending on whether FORMAT_COFF is defined by the build.
; NOTE(review): the COFF branch gives no explicit align=16 qualifier, unlike
; the other branch -- confirm the resulting section alignment is >= 16, since
; mmx_01bytes is used as a 128-bit memory operand.
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata align=16
%endif


; 16 bytes, each with value 0x01 (i.e. the dword 0x01010101 repeated four
; times). COPY_16_TIMES / COPY_16_TIMESS multiply a zero-extended byte by this
; constant (pmuludq) to replicate that byte across a dword.
; Padded to a 16-byte boundary within the section.
align 16
mmx_01bytes:		times 16	db 1

;***********************************************************************
; macros
;***********************************************************************
;-----------------------------------------------------------------------
; COPY_16_TIMES  addr_reg, xmm_dst
;   %1 = general-purpose register holding an address
;   %2 = XMM register that receives the result (overwritten)
;
; Broadcasts the single byte located at [%1 - 1] into all 16 bytes of %2.
; The 16 bytes ending just before %1 are fetched with an aligned load, so
; (%1 - 16) has to be 16-byte aligned.
;-----------------------------------------------------------------------
%macro  COPY_16_TIMES 2
        movdqa      %2, [%1 - 16]           ; 16 bytes ending at %1 - 1
        psrldq      %2, 15                  ; keep only the top byte, moved to byte 0
        pmuludq     %2, [mmx_01bytes]       ; b * 0x01010101 -> low dword = b,b,b,b
        pshufd      %2, %2, 0x00            ; replicate dword 0 into all four dwords
%endmacro

;-----------------------------------------------------------------------
; COPY_16_TIMESS  addr_reg, xmm_dst, offset
;   %1 = general-purpose register holding a base address
;   %2 = XMM register that receives the result (overwritten)
;   %3 = extra term added to the base address (register or constant)
;
; Same as COPY_16_TIMES, but relative to (%1 + %3): broadcasts the byte at
; [%1 + %3 - 1] into all 16 bytes of %2. The load is aligned, so
; (%1 + %3 - 16) has to be 16-byte aligned.
;-----------------------------------------------------------------------
%macro  COPY_16_TIMESS 3
        movdqa      %2, [%1 + %3 - 16]      ; 16 bytes ending at %1 + %3 - 1
        psrldq      %2, 15                  ; keep only the top byte, moved to byte 0
        pmuludq     %2, [mmx_01bytes]       ; b * 0x01010101 -> low dword = b,b,b,b
        pshufd      %2, %2, 0x00            ; replicate dword 0 into all four dwords
%endmacro

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************

;-----------------------------------------------------------------------
; SSE2_PRED_H_16X16_TWO_LINE  dst_offset
;   %1 = byte offset into the destination at which the first of the two
;        16-byte rows is stored (the second goes to %1 + 0x10)
;
; Register contract (set up by the invoking code):
;   eax = address of the current source row; advanced by 2*ecx here
;   ecx = source stride in bytes
;   edx = destination base address (stores are aligned 16-byte stores)
; Clobbers xmm0.
;
; After stepping eax down two rows, each of the next two source rows has the
; byte immediately to its left replicated 16 times and written out.
;-----------------------------------------------------------------------
%macro SSE2_PRED_H_16X16_TWO_LINE 1
        lea         eax, [eax + ecx*2]      ; move on to the next pair of rows

        COPY_16_TIMES   eax, xmm0           ; xmm0 = 16 x byte [eax - 1]
        movdqa      [edx + %1], xmm0
        COPY_16_TIMESS  eax, xmm0, ecx      ; xmm0 = 16 x byte [eax + ecx - 1]
        movdqa      [edx + %1 + 0x10], xmm0
%endmacro

;-----------------------------------------------------------------------
; Horizontal 16x16 prediction.
; In (32-bit, arguments on the stack):
;   [esp+4]  pred   - destination, written as 16 consecutive 16-byte rows
;                     (offsets 0x00..0xF0) with aligned stores
;   [esp+8]  pRef   - address of the first source row
;   [esp+12] stride - distance in bytes between source rows
; Effect: destination row k (k = 0..15) is filled with the byte found at
;         pRef + k*stride - 1.
; Clobbers: eax, ecx, edx, xmm0.
;-----------------------------------------------------------------------
WELS_EXTERN WelsI16x16LumaPredH_sse2
WelsI16x16LumaPredH_sse2:
        mov         edx, [esp + 4]          ; edx = pred
        mov         eax, [esp + 8]          ; eax = pRef
        mov         ecx, [esp + 12]         ; ecx = stride

        ; Rows 0 and 1 are handled inline because eax must not be advanced yet.
        COPY_16_TIMES   eax, xmm0
        movdqa      [edx], xmm0
        COPY_16_TIMESS  eax, xmm0, ecx
        movdqa      [edx + 0x10], xmm0

        ; Rows 2..15: seven row pairs, destination offsets 0x20, 0x40, ... 0xE0.
%assign PRED_H_PAIR_OFS 0x20
%rep 7
        SSE2_PRED_H_16X16_TWO_LINE  PRED_H_PAIR_OFS
%assign PRED_H_PAIR_OFS PRED_H_PAIR_OFS + 0x20
%endrep
%undef PRED_H_PAIR_OFS

        ret

;***********************************************************************
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
;-----------------------------------------------------------------------
; Vertical 16x16 prediction.
; In (32-bit, arguments on the stack):
;   [esp+4]  pred   - destination, written as 16 consecutive 16-byte rows
;                     (offsets 0x00..0xF0) with aligned stores
;   [esp+8]  pRef   - address of the first source row
;   [esp+12] stride - distance in bytes between source rows
; Effect: the 16 bytes at (pRef - stride) are copied into every one of the
;         16 destination rows. The source is read with an aligned load, so
;         (pRef - stride) has to be 16-byte aligned.
; Clobbers: eax, ecx, edx, xmm0.
;-----------------------------------------------------------------------
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
        mov         edx, [esp + 4]          ; edx = pred
        mov         eax, [esp + 8]          ; eax = pRef
        mov         ecx, [esp + 12]         ; ecx = stride

        sub         eax, ecx                ; eax = pRef - stride (row above)
        movdqa      xmm0, [eax]             ; the 16 bytes to replicate

        movdqa      [edx], xmm0             ; row 0
        ; Rows 1..15 at offsets 0x10, 0x20, ... 0xF0.
%assign PRED_V_ROW_OFS 0x10
%rep 15
        movdqa      [edx + PRED_V_ROW_OFS], xmm0
%assign PRED_V_ROW_OFS PRED_V_ROW_OFS + 0x10
%endrep
%undef PRED_V_ROW_OFS

        ret