shithub: openh264

ref: cf92e8d6208af51fb55ed61554b135585ed33c1b
dir: /codec/encoder/core/src/decode_mb_aux.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <string.h>
#include "decode_mb_aux.h"
#include "wels_common_basis.h"
#include "cpu_core.h"

namespace WelsSVCEnc {
/****************************************************************************
 * Dequantization and inverse Hadamard transform (IHDM) functions
 ****************************************************************************/
/*!
 * \brief	In-place two-pass butterfly (inverse Hadamard) on a 4x4 block.
 *
 * The 16 values at pRes are treated as 4 rows of 4. Each row is transformed
 * first, then each column; no scaling or rounding is applied. Every
 * intermediate and final value is stored back as int16_t, so results wrap
 * to 16 bits exactly as the stores do.
 *
 * \param	pRes	16 coefficients, overwritten with the transformed values
 */
void WelsIHadamard4x4Dc(int16_t *pRes) //pBuffer size : 4x4
{
	// Pass 1: butterfly along each row (4 consecutive values).
	for (int32_t iRow = 0; iRow < 4; ++iRow)
	{
		int16_t* pLine = pRes + (iRow << 2);

		const int16_t kiSum02 = pLine[0] + pLine[2];
		const int16_t kiDif02 = pLine[0] - pLine[2];
		const int16_t kiDif13 = pLine[1] - pLine[3];
		const int16_t kiSum13 = pLine[1] + pLine[3];

		pLine[0] = kiSum02 + kiSum13;
		pLine[1] = kiDif02 + kiDif13;
		pLine[2] = kiDif02 - kiDif13;
		pLine[3] = kiSum02 - kiSum13;
	}

	// Pass 2: same butterfly along each column (values 4 apart).
	for (int32_t iCol = 0; iCol < 4; ++iCol)
	{
		int16_t* pColumn = pRes + iCol;

		const int16_t kiSum02 = pColumn[0] + pColumn[8];
		const int16_t kiDif02 = pColumn[0] - pColumn[8];
		const int16_t kiDif13 = pColumn[4] - pColumn[12];
		const int16_t kiSum13 = pColumn[4] + pColumn[12];

		pColumn[0]  = kiSum02 + kiSum13;
		pColumn[4]  = kiDif02 + kiDif13;
		pColumn[8]  = kiDif02 - kiDif13;
		pColumn[12] = kiSum02 - kiSum13;
	}
}

/*!
 * \brief	In-place dequantization with rounding of 16 luma DC coefficients.
 *
 * For qp < 12 only: the shift amount is 2 - kiQp/6 and the rounding offset is
 * 1 << (1 - kiQp/6), so a larger kiQp would produce a negative shift count.
 *
 * \param	pRes	16 coefficients, scaled in place
 * \param	kiQp	quantization parameter; selects g_kuiDequantCoeff[kiQp%6][0]
 */
void WelsDequantLumaDc4x4(int16_t *pRes, const int32_t kiQp)
{
	const uint16_t kuiScale	= g_kuiDequantCoeff[kiQp%6][0];
	const int16_t kiQpDiv6	= kiQp / 6;
	const int16_t kiShift	= 2 - kiQpDiv6;
	const int16_t kiRound	= 1 << (1 - kiQpDiv6);

	// Each coefficient is independent: scale, add the rounding offset, shift down.
	for (int32_t iIdx = 0; iIdx < 16; ++iIdx)
	{
		pRes[iIdx] = ( pRes[iIdx] * kuiScale + kiRound ) >> kiShift;
	}
}

/*!
 * \brief	In-place 4x4 butterfly (inverse Hadamard) with dequantization folded
 *			into the second pass. Per the original note: for qp >= 12.
 *
 * Rows are transformed first, then columns; every value produced by the column
 * pass is multiplied by kuiMF before being stored. All stores are int16_t.
 *
 * \param	pRes	16 coefficients (4 rows of 4), overwritten with the result
 * \param	kuiMF	multiplier applied to every output value
 */
void WelsDequantIHadamard4x4_c(int16_t *pRes, const uint16_t kuiMF)
{
	// Row pass: plain butterfly, no scaling.
	for (int32_t iRow = 0; iRow < 4; ++iRow)
	{
		int16_t* pLine = pRes + (iRow << 2);

		const int16_t kiSum02 = pLine[0] + pLine[2];
		const int16_t kiDif02 = pLine[0] - pLine[2];
		const int16_t kiDif13 = pLine[1] - pLine[3];
		const int16_t kiSum13 = pLine[1] + pLine[3];

		pLine[0] = kiSum02 + kiSum13;
		pLine[1] = kiDif02 + kiDif13;
		pLine[2] = kiDif02 - kiDif13;
		pLine[3] = kiSum02 - kiSum13;
	}

	// Column pass: butterfly, then scale each result by kuiMF.
	for (int32_t iCol = 0; iCol < 4; ++iCol)
	{
		int16_t* pColumn = pRes + iCol;

		const int16_t kiSum02 = pColumn[0] + pColumn[8];
		const int16_t kiDif02 = pColumn[0] - pColumn[8];
		const int16_t kiDif13 = pColumn[4] - pColumn[12];
		const int16_t kiSum13 = pColumn[4] + pColumn[12];

		pColumn[0]  = (kiSum02 + kiSum13) * kuiMF;
		pColumn[4]  = (kiDif02 + kiDif13) * kuiMF;
		pColumn[8]  = (kiDif02 - kiDif13) * kuiMF;
		pColumn[12] = (kiSum02 - kiSum13) * kuiMF;
	}
}

/*!
 * \brief	In-place 2x2 butterfly (inverse Hadamard) on four DC values, with
 *			each result multiplied by kuiMF.
 *
 * \param	pDct	four coefficients, overwritten with the scaled results
 * \param	kuiMF	multiplier applied to every output value
 */
void WelsDequantIHadamard2x2Dc( int16_t* pDct, const uint16_t kuiMF)
{
	// Read all four inputs up front; the outputs overwrite the same slots.
	const int32_t kiC0 = pDct[0];
	const int32_t kiC1 = pDct[1];
	const int32_t kiC2 = pDct[2];
	const int32_t kiC3 = pDct[3];

	// First stage, stored as 16-bit values.
	const int16_t kiEvenSum  = kiC0 + kiC2;
	const int16_t kiEvenDiff = kiC0 - kiC2;
	const int16_t kiOddSum   = kiC1 + kiC3;
	const int16_t kiOddDiff  = kiC1 - kiC3;

	// Second stage plus scaling.
	pDct[0] = (kiEvenSum  + kiOddSum)  * kuiMF;
	pDct[1] = (kiEvenSum  - kiOddSum)  * kuiMF;
	pDct[2] = (kiEvenDiff + kiOddDiff) * kuiMF;
	pDct[3] = (kiEvenDiff - kiOddDiff) * kuiMF;
}

/*!
 * \brief	In-place dequantization of one 16-coefficient block.
 *
 * Only kpMF[0..7] is read; the same eight multipliers are applied to the
 * first and the second half of the block.
 *
 * \param	pRes	16 coefficients, scaled in place
 * \param	kpMF	eight multipliers
 */
void WelsDequant4x4_c(int16_t *pRes, const uint16_t* kpMF)
{
	for (int32_t iCoef = 0; iCoef < 8; ++iCoef)
	{
		// Same multiplier for position iCoef in each 8-coefficient half.
		for (int32_t iHalf = 0; iHalf < 16; iHalf += 8)
		{
			pRes[iHalf + iCoef] *= kpMF[iCoef];
		}
	}
}

/*!
 * \brief	In-place dequantization of 64 coefficients (four 16-coefficient blocks).
 *
 * Only kpMF[0..7] is read; multiplier kpMF[i] is applied to every coefficient
 * whose index is congruent to i modulo 8.
 *
 * \param	pRes	64 coefficients, scaled in place
 * \param	kpMF	eight multipliers
 */
void WelsDequantFour4x4_c(int16_t *pRes, const uint16_t* kpMF)
{
	for (int32_t iCoef = 0; iCoef < 8; ++iCoef)
	{
		// Walk the eight groups of 8 coefficients that share this multiplier.
		for (int32_t iBase = 0; iBase < 64; iBase += 8)
		{
			pRes[iBase + iCoef] *= kpMF[iCoef];
		}
	}
}

/****************************************************************************
 * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
 ****************************************************************************/
/*!
 * \brief	Inverse 4x4 transform of pDct, added to the prediction and clipped
 *			into the reconstruction block.
 *
 * A row pass writes 16-bit intermediates; the column pass then rounds with
 * (+32) >> 6, adds the co-located prediction sample and stores the value
 * through WELS_CLIP1.
 *
 * \param	pRec		destination 4x4 block
 * \param	iStride		distance between consecutive rows of pRec
 * \param	pPred		prediction 4x4 block
 * \param	iPredStride	distance between consecutive rows of pPred
 * \param	pDct		16 input coefficients, 4 per row (read only)
 */
void WelsIDctT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
{
	int16_t iRowOut[16];

	// Row offsets for rows 2 and 3 of the destination and prediction blocks.
	const int32_t kiRecRow2  = iStride << 1;
	const int32_t kiRecRow3  = iStride + kiRecRow2;
	const int32_t kiPredRow2 = iPredStride << 1;
	const int32_t kiPredRow3 = iPredStride + kiPredRow2;

	// Horizontal pass: one butterfly per row of coefficients.
	for (int32_t iRow = 0; iRow < 4; ++iRow)
	{
		const int16_t* pSrc = pDct + (iRow << 2);
		int16_t* pOut       = iRowOut + (iRow << 2);

		const int32_t kiEvenSum = pSrc[0] + pSrc[2];
		const int32_t kiEvenDif = pSrc[0] - pSrc[2];
		const int32_t kiOddSum  = pSrc[1] + (pSrc[3] >> 1);
		const int32_t kiOddDif  = (pSrc[1] >> 1) - pSrc[3];

		pOut[0] = kiEvenSum + kiOddSum;
		pOut[1] = kiEvenDif + kiOddDif;
		pOut[2] = kiEvenDif - kiOddDif;
		pOut[3] = kiEvenSum - kiOddSum;
	}

	// Vertical pass: butterfly per column, then round, add prediction and clip.
	for (int32_t iCol = 0; iCol < 4; ++iCol)
	{
		const int32_t kiEvenSum = iRowOut[iCol]            + iRowOut[8+iCol];
		const int32_t kiEvenDif = iRowOut[iCol]            - iRowOut[8+iCol];
		const int32_t kiOddDif  = (iRowOut[4+iCol] >> 1)   - iRowOut[12+iCol];
		const int32_t kiOddSum  = iRowOut[4+iCol]          + (iRowOut[12+iCol] >> 1);

		const int32_t kiRes0 = (kiEvenSum + kiOddSum + 32) >> 6;
		const int32_t kiRes1 = (kiEvenDif + kiOddDif + 32) >> 6;
		const int32_t kiRes2 = (kiEvenDif - kiOddDif + 32) >> 6;
		const int32_t kiRes3 = (kiEvenSum - kiOddSum + 32) >> 6;

		pRec[iCol]             = WELS_CLIP1( pPred[iCol]              + kiRes0 );
		pRec[iStride + iCol]   = WELS_CLIP1( pPred[iPredStride + iCol] + kiRes1 );
		pRec[kiRecRow2 + iCol] = WELS_CLIP1( pPred[kiPredRow2 + iCol]  + kiRes2 );
		pRec[kiRecRow3 + iCol] = WELS_CLIP1( pPred[kiPredRow3 + iCol]  + kiRes3 );
	}
}

/*!
 * \brief	Reconstructs an 8x8 area as four 4x4 blocks via WelsIDctT4Rec_c.
 *
 * Blocks are visited top-left, top-right, bottom-left, bottom-right and
 * consume 16 coefficients each from pDct, in that order.
 *
 * \param	pRec		destination 8x8 area
 * \param	iStride		distance between consecutive rows of pRec
 * \param	pPred		prediction 8x8 area
 * \param	iPredStride	distance between consecutive rows of pPred
 * \param	pDct		64 coefficients, 16 per 4x4 block
 */
void WelsIDctFourT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
{
	const int32_t kiRecRowStep  = iStride << 2;		// four rows down in pRec
	const int32_t kiPredRowStep = iPredStride << 2;	// four rows down in pPred

	for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
	{
		// Bit 0 selects the right-hand column, bit 1 the bottom row.
		const int32_t kiX       = (iBlk & 1) << 2;
		const int32_t kiRecOff  = kiX + ((iBlk >> 1) ? kiRecRowStep  : 0);
		const int32_t kiPredOff = kiX + ((iBlk >> 1) ? kiPredRowStep : 0);

		WelsIDctT4Rec_c( pRec + kiRecOff, iStride, pPred + kiPredOff, iPredStride, pDct + (iBlk << 4) );
	}
}

/*!
 * \brief	Reconstructs a 16x16 area by calling pfIDctFourT4 on its four 8x8 quadrants.
 *
 * Quadrants are visited top-left, top-right, bottom-left, bottom-right and
 * are handed 64 coefficients each from pDct, in that order.
 *
 * \param	pDst			destination 16x16 area
 * \param	iDstStride		distance between consecutive rows of pDst
 * \param	pPred			prediction 16x16 area
 * \param	iPredStride		distance between consecutive rows of pPred
 * \param	pDct			256 coefficients, 64 per quadrant
 * \param	pfIDctFourT4	routine invoked once per 8x8 quadrant
 */
void WelsIDctT4RecOnMb(uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct, PIDctFunc pfIDctFourT4)
{
	const int32_t kiDstRowStep  = iDstStride << 3;	// eight rows down in pDst
	const int32_t kiPredRowStep = iPredStride << 3;	// eight rows down in pPred

	for (int32_t iQuad = 0; iQuad < 4; ++iQuad)
	{
		// Bit 0 selects the right-hand half, bit 1 the bottom half.
		const int32_t kiX       = (iQuad & 1) << 3;
		const int32_t kiDstOff  = kiX + ((iQuad >> 1) ? kiDstRowStep  : 0);
		const int32_t kiPredOff = kiX + ((iQuad >> 1) ? kiPredRowStep : 0);

		pfIDctFourT4(pDst + kiDstOff, iDstStride, pPred + kiPredOff, iPredStride, pDct + (iQuad << 6));
	}
}

/*
 * pfIDctI16x16Dc: luma reconstruction of an I16x16 MB for the case where only
 * the DC coefficients are non-zero.
 *
 * Each 4x4 block of the 16x16 area shares one entry of pDctDc (row-major 4x4
 * table); that entry is rounded with (+32) >> 6, added to every prediction
 * sample of the block and stored through WELS_CLIP1.
 */
void WelsIDctRecI16x16Dc_c(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pDctDc)
{
	for (int32_t iRow = 0; iRow < 16; ++iRow)
	{
		// (iRow & 0x0C) is 0, 4, 8 or 12: start of the DC table row for this pixel row.
		const int16_t* pDcLine = pDctDc + (iRow & 0x0C);

		for (int32_t iCol = 0; iCol < 16; ++iCol)
		{
			const int32_t kiResidual = (pDcLine[iCol >> 2] + 32) >> 6;
			pRec[iCol] = WELS_CLIP1( pPred[iCol] + kiResidual );
		}

		pRec  += iStride;
		pPred += iPredStride;
	}
}

/*!
 * \brief	Fills a 24-entry table of block offsets derived from the two strides.
 *
 * Entries 0..15 are written in groups of four, one group per outer index
 * 0..3; each entry is (x + y * kiStrideY) << 2 with x, y in 0..3.
 * Entries 16..19 and 20..23 hold identical values computed from kiStrideUV
 * (presumably one set per chroma plane -- confirm against callers).
 *
 * \param	pBlock		output table, at least 24 entries
 * \param	kiStrideY	stride used for entries 0..15
 * \param	kiStrideUV	stride used for entries 16..23
 */
void WelsGetEncBlockStrideOffset(int32_t *pBlock, const int32_t kiStrideY, const int32_t kiStrideUV)
{
	for (int32_t iBlk8 = 0; iBlk8 < 4; ++iBlk8)
	{
		const int32_t kiColBit = iBlk8 & 0x01;		// 0 = left, 1 = right
		const int32_t kiX      = kiColBit << 1;		// x of the group's first entry: 0 or 2
		const int32_t kiY      = iBlk8 & 0x02;		// y of the group's first entry: 0 or 2
		int32_t* pGroup        = pBlock + (iBlk8 << 2);

		const int32_t kiUpperRow = (0 + kiY) * kiStrideY;
		const int32_t kiLowerRow = (1 + kiY) * kiStrideY;

		pGroup[0] = (0 + kiX + kiUpperRow) << 2;
		pGroup[1] = (1 + kiX + kiUpperRow) << 2;
		pGroup[2] = (0 + kiX + kiLowerRow) << 2;
		pGroup[3] = (1 + kiX + kiLowerRow) << 2;

		// NOTE(review): the row factor here is kiY (0 or 2), so entries for
		// iBlk8 >= 2 are 8 * kiStrideUV apart from the first row -- confirm
		// this is what the callers expect.
		const int32_t kiUvOffset = (kiColBit + kiY * kiStrideUV) << 2;
		pBlock[20 + iBlk8] = kiUvOffset;
		pBlock[16 + iBlk8] = kiUvOffset;
	}
}

/*!
 * \brief	Installs the reconstruction routines (dequantization and IDCT) into
 *			the function pointer list.
 *
 * The portable C implementations are always installed first. When built with
 * X86_ASM, entries are then replaced by the MMX / SSE2 variants according to
 * the bits set in uiCpuFlag.
 *
 * \param	pFuncList	function pointer list to fill in
 * \param	uiCpuFlag	CPU capability bit mask (tested against WELS_CPU_* flags)
 */
void WelsInitReconstructionFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag )
{
	// Portable defaults: IDCT + reconstruction.
	pFuncList->pfIDctT4			= WelsIDctT4Rec_c;
	pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_c;
	pFuncList->pfIDctI16x16Dc	= WelsIDctRecI16x16Dc_c;

	// Portable defaults: dequantization.
	pFuncList->pfDequantization4x4			= WelsDequant4x4_c;
	pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_c;
	pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_c;

#if defined(X86_ASM)
	// MMX only provides the single 4x4 IDCT.
	if ( uiCpuFlag & WELS_CPU_MMXEXT )
	{
		pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
	}

	// SSE2 replaces the remaining IDCT entries and all dequantization entries.
	if ( uiCpuFlag & WELS_CPU_SSE2 )
	{
		pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_sse2;
		pFuncList->pfIDctI16x16Dc	= WelsIDctRecI16x16Dc_sse2;

		pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
		pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_sse2;
		pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_sse2;
	}
#endif//X86_ASM
}
}