shithub: openh264

ref: 9b21ece789c066db96be8810f0892560adcd63b8
dir: /codec/encoder/core/src/svc_encode_mb.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file		svc_encode_mb.cpp
 *
 * \brief		Implementation of macroblock (pCurMb) encoding
 *
 * \date		05/19/2009 Created
 *************************************************************************************
 */

#include <stdio.h>	//test use for file operation
#include <string.h>

#include "svc_encode_mb.h"
#include "encode_mb_aux.h"
#include "decode_mb_aux.h"
#include "ls_defines.h"
#include "cpu_core.h"
#include "as264_common.h"
#include "mb_cache.h"
#include "array_stack_align.h"

namespace WelsSVCEnc {
/*!
 * \brief	Run pfDctFourT4 over the four 8x8 quadrants of a 16x16 block.
 *
 * \param	pRes		coefficient output; every quadrant occupies 64 consecutive entries
 * \param	pEncMb		source pixels, rows are iEncStride apart
 * \param	iEncStride	row stride of pEncMb
 * \param	pBestPred	prediction pixels, addressed with a fixed row stride of 16
 * \param	pfDctFourT4	routine called once per quadrant
 */
void WelsDctMb(int16_t* pRes, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4)
{
  // Quadrant order: top-left, top-right, bottom-left, bottom-right.
  for (int32_t iQuad = 0; iQuad < 4; ++iQuad) {
    const int32_t kiCol = (iQuad & 1) << 3;		// 0 or 8 pixels to the right
    const int32_t kiRow = (iQuad >> 1) << 3;	// 0 or 8 rows down
    pfDctFourT4(pRes + (iQuad << 6),
                pEncMb + kiRow * iEncStride + kiCol, iEncStride,
                pBestPred + (kiRow << 4) + kiCol, 16);
  }
}

/*!
 * \brief	Encode and reconstruct the luma of an Intra16x16 macroblock.
 *
 * Order of operations:
 *   1. DCT of the source against pMbCache->pMemPredLuma into pMbCache->pCoeffLevel.
 *   2. Hadamard, DC quantization and scan of the DC terms into pDct->iLumaI16x16Dc.
 *   3. Quantization and AC scan of the sixteen 4x4 blocks into pDct->iLumaBlock.
 *   4. Per-block non-zero counts into pCurMb->pNonZeroCount.
 *   5. Reconstruction into pMbCache->SPicData.pCsMb[0]: full inverse transform when any
 *      AC count is non-zero, DC-only inverse when only DC survived, plain copy of the
 *      prediction otherwise.
 *
 * \param	pEncCtx		encoder context; supplies the function table and current layer strides
 * \param	pCurMb		macroblock being coded; uiLumaQp is read, pNonZeroCount is written,
 *						uiCbp is set to 15 only when some AC count is non-zero
 * \param	pMbCache	per-macroblock cache with source, prediction, coefficient and output buffers
 */
void WelsEncRecI16x16Y(sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache)
{
  ENFORCE_STACK_ALIGN_1D(int16_t, aDctT4Dc, 16, 16)
  // Coefficient slot k*16 (the DC position of 4x4 block k) receives aDctT4Dc[kDcSourceIdx[k]].
  static const uint8_t kDcSourceIdx[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };

  SWelsFuncPtrList *pFuncs	= pEncCtx->pFuncList;
  SDqLayer *pLayer			= pEncCtx->pCurDqLayer;
  const int32_t kiSrcStride	= pLayer->iEncStride[0];
  const int32_t kiDstStride	= pLayer->iCsStride[0];
  int16_t * const pCoeff		= pMbCache->pCoeffLevel;
  int16_t * const pScanned	= pMbCache->pDct->iLumaBlock[0];
  uint8_t * const pRecon		= pMbCache->SPicData.pCsMb[0];
  uint8_t * const pPredLuma	= pMbCache->pMemPredLuma;
  const uint8_t kuiQp			= pCurMb->uiLumaQp;
  int16_t *pMF				= g_kiQuantMF[kuiQp];
  int16_t *pFF				= g_iQuantIntraFF[kuiQp];
  uint32_t uiAcTotal			= 0;

  WelsDctMb(pCoeff, pMbCache->SPicData.pEncMb[0], kiSrcStride, pPredLuma, pFuncs->pfDctFourT4);

  // DC path: Hadamard, quantize, scan, count.
  pFuncs->pfTransformHadamard4x4Dc(aDctT4Dc, pCoeff);
  pFuncs->pfQuantizationDc4x4(aDctT4Dc, pFF[0] << 1, pMF[0] >> 1);
  pFuncs->pfScan4x4(pMbCache->pDct->iLumaI16x16Dc, aDctT4Dc);
  const uint32_t kuiDcCount = pFuncs->pfGetNoneZeroCount(pMbCache->pDct->iLumaI16x16Dc);

  // AC path: quantize one 8x8 quadrant at a time, then scan its four 4x4 blocks.
  for (int32_t iQuad = 0; iQuad < 4; ++iQuad) {
    int16_t *pQuadCoeff	= pCoeff + (iQuad << 6);
    int16_t *pQuadScan	= pScanned + (iQuad << 6);
    pFuncs->pfQuantizationFour4x4(pQuadCoeff, pFF, pMF);
    for (int32_t iSub = 0; iSub < 4; ++iSub)
      pFuncs->pfScan4x4Ac(pQuadScan + (iSub << 4), pQuadCoeff + (iSub << 4));
  }

  // Non-zero count of every 4x4 block, stored at the position given by g_kuiMbCountScan4Idx.
  for (int32_t iBlk = 0; iBlk < 16; ++iBlk) {
    const uint32_t kuiCount = pFuncs->pfGetNoneZeroCount(pScanned + (iBlk << 4));
    pCurMb->pNonZeroCount[g_kuiMbCountScan4Idx[iBlk]] = kuiCount;
    uiAcTotal += kuiCount;
  }

  // Rebuild the DC terms only when at least one of them is non-zero.
  const bool kbHasDc = (kuiDcCount > 0);
  if (kbHasDc) {
    if (kuiQp >= 12) {
      pFuncs->pfDequantizationIHadamard4x4(aDctT4Dc, g_kuiDequantCoeff[kuiQp][0] >> 2);
    } else {
      // QP below 12 takes the two-step inverse Hadamard + DC dequantization.
      WelsIHadamard4x4Dc(aDctT4Dc);
      WelsDequantLumaDc4x4(aDctT4Dc, kuiQp);
    }
  }

  if (0 == uiAcTotal) {
    // No AC energy: either add the DC-only residual or just copy the prediction.
    if (kbHasDc)
      pFuncs->pfIDctI16x16Dc(pRecon, kiDstStride, pPredLuma, 16, aDctT4Dc);
    else
      pFuncs->pfCopy16x16Aligned(pRecon, kiDstStride, pPredLuma, 16);
    return;
  }

  pCurMb->uiCbp = 15;
  for (int32_t iQuad = 0; iQuad < 4; ++iQuad)
    pFuncs->pfDequantizationFour4x4(pCoeff + (iQuad << 6), g_kuiDequantCoeff[kuiQp]);

  // Put each DC value back into slot 0 of its 4x4 block.
  for (int32_t iBlk = 0; iBlk < 16; ++iBlk)
    pCoeff[iBlk << 4] = aDctT4Dc[kDcSourceIdx[iBlk]];

  for (int32_t iQuad = 0; iQuad < 4; ++iQuad) {
    const int32_t kiCol = (iQuad & 1) << 3;
    const int32_t kiRow = (iQuad >> 1) << 3;
    pFuncs->pfIDctFourT4(pRecon + kiDstStride * kiRow + kiCol, kiDstStride,
                         pPredLuma + (kiRow << 4) + kiCol, 16,
                         pCoeff + (iQuad << 6));
  }
}
/*!
 * \brief	Encode and reconstruct a single Intra4x4 luma block.
 *
 * The block is transformed against pMbCache->pBestPredI4x4Blk4, quantized and scanned into
 * pDct->iLumaBlock[uiI4x4Idx]. Its non-zero count is stored in pCurMb->pNonZeroCount. The
 * reconstruction is written into pMbCache->SPicData.pCsMb[0] at the block's offset.
 *
 * \param	pEncCtx		encoder context; supplies the function table, strides and block offset tables
 * \param	pCurMb		macroblock being coded; uiLumaQp is read, pNonZeroCount and uiCbp are updated
 * \param	pMbCache	per-macroblock cache with source, prediction, coefficient and output buffers
 * \param	uiI4x4Idx	index of the 4x4 block inside the macroblock
 */
void WelsEncRecI4x4Y( sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache, uint8_t uiI4x4Idx)
{
  SWelsFuncPtrList *pFuncs	= pEncCtx->pFuncList;
  SDqLayer *pLayer			= pEncCtx->pCurDqLayer;
  const int32_t kiSrcStride	= pLayer->iEncStride[0];
  const int32_t kiDstStride	= pLayer->iCsStride[0];
  const uint8_t kuiQp			= pCurMb->uiLumaQp;
  int16_t *pMF				= g_kiQuantMF[kuiQp];
  int16_t *pFF				= g_iQuantIntraFF[kuiQp];

  // Per-block pixel offsets for the source and the reconstruction planes.
  int32_t *pSrcOffsets = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
  int32_t *pDstOffsets = pEncCtx->pStrideTab->pStrideDecBlockOffset[pEncCtx->uiDependencyId][0==pEncCtx->uiTemporalId];

  int16_t *pCoeff		= pMbCache->pCoeffLevel;
  int16_t *pScanned	= pMbCache->pDct->iLumaBlock[uiI4x4Idx];
  uint8_t *pPredBlk	= pMbCache->pBestPredI4x4Blk4;
  uint8_t *pSrcBlk	= pMbCache->SPicData.pEncMb[0] + pSrcOffsets[uiI4x4Idx];
  uint8_t *pReconBlk	= pMbCache->SPicData.pCsMb[0] + pDstOffsets[uiI4x4Idx];

  pFuncs->pfDctT4(pCoeff, pSrcBlk, kiSrcStride, pPredBlk, 4);
  pFuncs->pfQuantization4x4(pCoeff, pFF, pMF);
  pFuncs->pfScan4x4(pScanned, pCoeff);

  const int32_t kiCount = pFuncs->pfGetNoneZeroCount(pScanned);
  pCurMb->pNonZeroCount[g_kuiMbCountScan4Idx[uiI4x4Idx]] = kiCount;

  if (kiCount <= 0) {
    // Nothing survived quantization: the reconstruction equals the prediction.
    WelsCopy4x4(pReconBlk, kiDstStride, pPredBlk, 4);
    return;
  }

  // Flag the 8x8 quadrant containing this block as coded.
  pCurMb->uiCbp |= 1 << (uiI4x4Idx>>2);
  pFuncs->pfDequantization4x4(pCoeff, g_kuiDequantCoeff[kuiQp]);
  pFuncs->pfIDctT4(pReconBlk, kiDstStride, pPredBlk, 4, pCoeff);
}

/*!
 * \brief	Quantize the luma residual of an inter macroblock and drop isolated coefficients.
 *
 * Works on pMbCache->pCoeffLevel (256 luma coefficients, sixteen 4x4 blocks of 16 entries).
 * Each 8x8 quadrant is quantized and its 4x4 blocks scanned into pDct->iLumaBlock, while a
 * "single coefficient" cost is accumulated per quadrant and per macroblock. The thresholds
 * follow JVT-O079, as noted in the original code:
 *   - macroblock cost below 6: the coefficient buffer is cleared and nothing is coded;
 *   - quadrant cost below 4: that quadrant's coefficients are cleared;
 *   - otherwise the quadrant is dequantized in place and its bit is set in pCurMb->uiCbp.
 * The first 16 entries of pCurMb->pNonZeroCount are reset and refilled for coded quadrants.
 *
 * \param	pFuncList	function table with the quantization, scan and helper routines
 * \param	pCurMb		macroblock being coded; uiLumaQp is read, pNonZeroCount and uiCbp are updated
 * \param	pMbCache	per-macroblock cache holding the coefficient and scanned-block buffers
 */
void WelsEncInterY(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
{
  int16_t * const pCoeff		= pMbCache->pCoeffLevel;
  int16_t * const pScanned	= pMbCache->pDct->iLumaBlock[0];
  const uint8_t kuiQp			= pCurMb->uiLumaQp;
  int16_t *pMF				= g_kiQuantMF[kuiQp];
  int16_t *pFF				= g_kiQuantInterFF[kuiQp];
  int16_t aMax[16];			// largest quantized level of every 4x4 block
  int32_t aCost8x8[4];
  int32_t iCostMb				= 0;
  int32_t iBlk8, iBlk4;

  for (iBlk8 = 0; iBlk8 < 4; ++iBlk8) {
    int16_t *pQuadMax	= aMax + (iBlk8 << 2);
    int32_t iCost		= 0;

    pFuncList->pfQuantizationFour4x4Max(pCoeff + (iBlk8 << 6), pFF, pMF, pQuadMax);

    for (iBlk4 = 0; iBlk4 < 4; ++iBlk4) {
      const int32_t kiOffset	= (iBlk8 << 6) + (iBlk4 << 4);
      int16_t *pDst			= pScanned + kiOffset;

      if (0 == pQuadMax[iBlk4]) {
        pFuncList->pfSetMemZeroSize8(pDst, 32);	// 16 int16_t entries
        continue;
      }

      pFuncList->pfScan4x4(pDst, pCoeff + kiOffset);
      if (pQuadMax[iBlk4] > 1)
        iCost += 9;
      else if (iCost < 6)
        iCost += pFuncList->pfCalculateSingleCtr4x4(pDst);
    }

    aCost8x8[iBlk8] = iCost;
    iCostMb += iCost;
  }

  memset(pCurMb->pNonZeroCount, 0, 16);

  if (iCostMb < 6) {	//from JVT-O079
    // 768 bytes cover 384 int16_t entries starting at the luma coefficients.
    pFuncList->pfSetMemZeroSize64(pCoeff, 768);	// confirmed_safe_unsafe_usage
    return;
  }

  for (iBlk8 = 0; iBlk8 < 4; ++iBlk8) {
    int16_t *pQuadCoeff = pCoeff + (iBlk8 << 6);

    if (aCost8x8[iBlk8] < 4) {
      // set zero for an 8x8 block
      pFuncList->pfSetMemZeroSize64(pQuadCoeff, 128);	// confirmed_safe_unsafe_usage
      continue;
    }

    for (iBlk4 = 0; iBlk4 < 4; ++iBlk4) {
      const int32_t kiBlkIdx	= (iBlk8 << 2) + iBlk4;
      const int32_t kiCount	= pFuncList->pfGetNoneZeroCount(pScanned + (kiBlkIdx << 4));
      pCurMb->pNonZeroCount[g_kuiMbCountScan4Idx[kiBlkIdx]] = kiCount;
    }
    pFuncList->pfDequantizationFour4x4(pQuadCoeff, g_kuiDequantCoeff[kuiQp]);
    pCurMb->uiCbp |= 1 << iBlk8;
  }
}

/*!
 * \brief	Quantize and dequantize the residual of one chroma plane of a macroblock.
 *
 * pRes points at 64 coefficients (four 4x4 blocks, 16 entries apart). The 2x2 DC terms are
 * quantized into pDct->iChromaDc[iUV-1], and the AC terms are quantized and scanned into
 * pDct->iChromaBlock. For inter macroblocks a "single coefficient" cost is accumulated and,
 * when it stays below 7 (JVT-O079), the AC coefficients are cleared. Intra macroblocks with
 * any non-zero AC block always keep their AC coefficients. pCurMb->uiCbp bits 4 and 5 and
 * the chroma entries of pCurMb->pNonZeroCount are updated accordingly.
 *
 * \param	pFuncList	function table with the quantization, scan and helper routines
 * \param	pCurMb		macroblock being coded; uiMbType and uiChromaQp are read
 * \param	pMbCache	per-macroblock cache holding the scanned-block buffers
 * \param	pRes		chroma coefficients of the selected plane, processed in place
 * \param	iUV			1 or 2; selects plane index iUV-1
 */
void    WelsEncRecUV(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache, int16_t * pRes, int32_t iUV)
{
  const int32_t kiIsInter			= !IS_INTRA( pCurMb->uiMbType);
  const uint8_t kuiQp				= pCurMb->uiChromaQp;
  const uint8_t kuiNzcPairOffset	= (iUV - 1)<<1;			//UV==1 or 2
  const uint8_t kuiFirstScanIdx	= 16 + ((iUV - 1)<<2);	//16 or 20
  int16_t *pDcOut					= pMbCache->pDct->iChromaDc[iUV-1];
  int16_t * const pScanned		= pMbCache->pDct->iChromaBlock[(iUV - 1)<<2];
  int16_t *pMF					= g_kiQuantMF[kuiQp];
  int16_t *pFF					= g_kiQuantInterFF[(!kiIsInter)*6+kuiQp];
  int16_t aDct2x2[4];
  int16_t aMax[4];				// largest quantized level of every 4x4 block
  int32_t iCost					= 0;
  int32_t iBlk;

  const uint8_t kuiDcCount = pFuncList->pfQuantizationHadamard2x2(pRes, pFF[0]<<1, pMF[0]>>1, aDct2x2, pDcOut);

  pFuncList->pfQuantizationFour4x4Max(pRes, pFF, pMF, aMax);

  for (iBlk = 0; iBlk < 4; ++iBlk) {
    int16_t *pDst = pScanned + (iBlk << 4);

    if (0 == aMax[iBlk]) {
      pFuncList->pfSetMemZeroSize8(pDst, 32);	// 16 int16_t entries
      continue;
    }

    pFuncList->pfScan4x4Ac(pDst, pRes + (iBlk << 4));
    if (!kiIsInter)
      iCost = INT_MAX;	// intra: never discard the AC coefficients
    else if (aMax[iBlk] > 1)
      iCost += 9;
    else if (iCost < 7)
      iCost += pFuncList->pfCalculateSingleCtr4x4(pDst);
  }

  if (iCost < 7) {	//from JVT-O079
    pFuncList->pfSetMemZeroSize64(pRes, 128);	// confirmed_safe_unsafe_usage
    ST16( &pCurMb->pNonZeroCount[16+kuiNzcPairOffset], 0 );
    ST16( &pCurMb->pNonZeroCount[20+kuiNzcPairOffset], 0 );
  } else {
    for (iBlk = 0; iBlk < 4; ++iBlk) {
      const uint8_t kuiCount = pFuncList->pfGetNoneZeroCount(pScanned + (iBlk << 4));
      pCurMb->pNonZeroCount[g_kuiMbCountScan4Idx[kuiFirstScanIdx + iBlk]] = kuiCount;
    }
    pFuncList->pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[kuiQp]);
    // Keep the luma bits, force the chroma field to 2.
    pCurMb->uiCbp = (pCurMb->uiCbp & 0x0F) | 0x20;
  }

  if (kuiDcCount > 0) {
    WelsDequantIHadamard2x2Dc(aDct2x2, g_kuiDequantCoeff[kuiQp][0] >> 1);
    // Leave a chroma field of exactly 2 untouched; otherwise set bit 4.
    if ( (pCurMb->uiCbp >> 4) != 2 )
      pCurMb->uiCbp |= (0x01 << 4);
    // Put each DC value back into slot 0 of its 4x4 block.
    for (iBlk = 0; iBlk < 4; ++iBlk)
      pRes[iBlk << 4] = aDct2x2[iBlk];
  }
}


/*!
 * \brief	Reconstruct a skipped macroblock by copying pMbCache->pSkipMb into the output planes.
 *
 * pSkipMb is read as three consecutive regions: a 16x16 block at offset 0 and two 8x8 blocks
 * at offsets 256 and 320. They are copied to SPicData.pCsMb[0..2] with the matching strides
 * from pCurLayer->iCsStride. The first 24 bytes of pCurMb->pNonZeroCount are then cleared.
 *
 * \param	pCurLayer	current layer; supplies the output strides
 * \param	pFuncList	function table with the copy and memory-clear routines
 * \param	pCurMb		macroblock whose non-zero counts are reset
 * \param	pMbCache	per-macroblock cache holding pSkipMb and the output plane pointers
 */
void    WelsRecPskip(SDqLayer *pCurLayer, SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
{
  uint8_t *pSkipSrc = pMbCache->pSkipMb;

  pFuncList->pfCopy16x16Aligned(pMbCache->SPicData.pCsMb[0], pCurLayer->iCsStride[0], pSkipSrc, 16);
  pFuncList->pfCopy8x8Aligned(pMbCache->SPicData.pCsMb[1], pCurLayer->iCsStride[1], pSkipSrc + 256, 8);
  pFuncList->pfCopy8x8Aligned(pMbCache->SPicData.pCsMb[2], pCurLayer->iCsStride[2], pSkipSrc + 320, 8);

  pFuncList->pfSetMemZeroSize8(pCurMb->pNonZeroCount, 24);
}

/*!
 * \brief	Test whether the luma residual of a macroblock is small enough to be skipped.
 *
 * The 256 luma coefficients in pMbCache->pCoeffLevel are passed, one 8x8 quadrant at a time,
 * to pfQuantizationFour4x4Max. Blocks whose maximum level is 1 are scanned into
 * pDct->iLumaBlock and contribute to a "single coefficient" cost.
 *
 * \param	pEncCtx		encoder context; supplies the function table
 * \param	pCurMb		macroblock under test; uiLumaQp is read
 * \param	pMbCache	per-macroblock cache holding the coefficient and scanned-block buffers
 *
 * \return	FALSE as soon as a block has a maximum level above 1 or the accumulated cost
 *			reaches 6 (JVT-O079); TRUE otherwise.
 */
BOOL_T WelsTryPYskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache)
{
  SWelsFuncPtrList *pFuncs	= pEncCtx->pFuncList;
  const uint8_t kuiQp			= pCurMb->uiLumaQp;
  int16_t *pMF				= g_kiQuantMF[kuiQp];
  int16_t *pFF				= g_kiQuantInterFF[kuiQp];
  int16_t *pCoeff				= pMbCache->pCoeffLevel;
  int16_t *pScanned			= pMbCache->pDct->iLumaBlock[0];
  uint16_t aMax[4];			// largest quantized level of the four blocks in a quadrant
  int32_t iCost				= 0;

  for (int32_t iBlk8 = 0; iBlk8 < 4; ++iBlk8) {
    pFuncs->pfQuantizationFour4x4Max(pCoeff, pFF, pMF, reinterpret_cast<int16_t*>(aMax));

    for (int32_t iBlk4 = 0; iBlk4 < 4; ++iBlk4, pCoeff += 16, pScanned += 16) {
      if (aMax[iBlk4] > 1)
        return FALSE;	// would add 9 to the cost, can't be P_SKIP
      if (0 == aMax[iBlk4])
        continue;		// empty block leaves the cost unchanged

      pFuncs->pfScan4x4(pScanned, pCoeff);
      iCost += pFuncs->pfCalculateSingleCtr4x4(pScanned);
      // The cost only changes here, so this is the only place the limit can be crossed.
      if (iCost >= 6)
        return FALSE;	//from JVT-O079
    }
  }
  return TRUE;
}

/*!
 * \brief	Test whether the residual of one chroma plane is small enough to be skipped.
 *
 * The plane's 64 coefficients start at pMbCache->pCoeffLevel[256] for iUV == 1 and at
 * pCoeffLevel[320] otherwise. The chroma QP is looked up in g_kuiChromaQpTable from the luma
 * QP plus the PPS chroma QP index offset. The DC terms are tested first through
 * pfQuantizationHadamard2x2Skip. The four 4x4 blocks then go through
 * pfQuantizationFour4x4Max, and blocks whose maximum level is 1 are scanned into
 * pDct->iChromaBlock and contribute to a "single coefficient" cost.
 *
 * \param	pEncCtx		encoder context; supplies the function table and the current layer's PPS
 * \param	pCurMb		macroblock under test; uiLumaQp is read
 * \param	pMbCache	per-macroblock cache holding the coefficient and scanned-block buffers
 * \param	iUV			1 or 2; selects the plane
 *
 * \return	FALSE when the DC test reports non-zero, when a block has a maximum level above 1,
 *			or when the accumulated cost reaches 7 (JVT-O079); TRUE otherwise.
 */
BOOL_T    WelsTryPUVskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache, int32_t iUV)
{
  SWelsFuncPtrList *pFuncs	= pEncCtx->pFuncList;
  int16_t *pCoeff				= pMbCache->pCoeffLevel + ((iUV == 1) ? 256 : (256+64));

  const uint8_t kuiQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];

  int16_t *pMF = g_kiQuantMF[kuiQp];
  int16_t *pFF = g_kiQuantInterFF[kuiQp];

  // Any surviving DC coefficient rules out the skip immediately.
  if (pFuncs->pfQuantizationHadamard2x2Skip(pCoeff, pFF[0]<<1, pMF[0]>>1))
    return FALSE;

  uint16_t aMax[4];		// largest quantized level of every 4x4 block
  int32_t iCost		= 0;
  int16_t *pScanned	= pMbCache->pDct->iChromaBlock[(iUV-1)<<2];

  pFuncs->pfQuantizationFour4x4Max(pCoeff, pFF, pMF, reinterpret_cast<int16_t*>(aMax));

  for (int32_t iBlk = 0; iBlk < 4; ++iBlk, pCoeff += 16, pScanned += 16) {
    if (aMax[iBlk] > 1)
      return FALSE;	// would add 9 to the cost, can't be P_SKIP
    if (0 == aMax[iBlk])
      continue;		// empty block leaves the cost unchanged

    pFuncs->pfScan4x4Ac(pScanned, pCoeff);
    iCost += pFuncs->pfCalculateSingleCtr4x4(pScanned);
    // The cost only changes here, so this is the only place the limit can be crossed.
    if (iCost >= 7)
      return FALSE;	//from JVT-O079
  }
  return TRUE;
}

} // namespace WelsSVCEnc