shithub: openh264

ref: 3331ff656676a43e5fd97327c569734399d5fa40
dir: /codec/decoder/core/src/decode_slice.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2008-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 *  Abstract
 *      current slice decoding
 *
 *  History
 *      07/10/2008 Created
 *      08/09/2013 Modified
 *
 *****************************************************************************/
#include <memory.h>

#include "typedefs.h"
#include "dec_golomb.h"

#include "fmo.h"
#include "deblocking.h"
#include "utils.h"

#include "decode_slice.h"

#include "error_code.h"
#include "decode_mb_aux.h"
#include "parse_mb_syn_cavlc.h"
#include "rec_mb.h"
#include "mv_pred.h"

#include "as264_common.h"
#include "cpu_core.h"
#include "expand_pic.h"

namespace WelsDec {

/*!
 * \brief	Reconstruct every macroblock of the current slice and then run the
 *			deblocking filter over the slice.
 *
 * Walks the macroblocks starting at the slice header's iFirstMbInSlice, using
 * FmoNextMb() when the PPS declares more than one slice group and plain raster
 * order otherwise. The walk stops once iTotalMbInCurSlice macroblocks have been
 * rebuilt, or when the next index is -1 or not below the SPS macroblock count.
 *
 * \param	pCtx	decoder context; works on pCtx->pCurDqLayer
 * \return	0 on success; -1 when the layer width does not match the sequence
 *			maximum (non-AVC-based case), when a macroblock fails to reconstruct,
 *			or when more macroblocks were reconstructed than the SPS allows
 */
int32_t WelsTargetSliceConstruction( PWelsDecoderContext pCtx )
{
	PDqLayer pLayer       = pCtx->pCurDqLayer;
	PSlice pSlice         = &pLayer->sLayerInfo.sSliceInLayer;
	PSliceHeader pHeader  = &pSlice->sSliceHeaderExt.sSliceHeader;
	PFmo pFmo             = pCtx->pFmo;

	const int32_t kiMbLimit     = pHeader->pSps->uiTotalMbCount;
	const int32_t kiWidthInPix  = pLayer->iMbWidth << 4;	// 16 pixels per macroblock
	const int32_t kiHeightInPix = pLayer->iMbHeight << 4;
	const int32_t kiMbsInSlice  = pSlice->iTotalMbInCurSlice;

	int32_t iMbIdx   = 0;
	int32_t iMbsDone = 0;

	if ( !pCtx->bAvcBasedFlag && kiWidthInPix != pCtx->iCurSeqIntervalMaxPicWidth )
	{
		return -1;
	}

	// position the layer on the first macroblock of the slice
	iMbIdx = pHeader->iFirstMbInSlice;
	pLayer->iMbX       = iMbIdx % pLayer->iMbWidth;
	pLayer->iMbY       = iMbIdx / pLayer->iMbWidth;
	pLayer->iMbXyIndex = iMbIdx;

	// a slice that starts at macroblock 0 stamps the picture with its parameter-set ids
	if ( 0 == iMbIdx )
	{
		pLayer->pDec->iSpsId      = pHeader->iSpsId;
		pLayer->pDec->iPpsId      = pHeader->iPpsId;
		pLayer->pDec->uiQualityId = pLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
	}

	for ( ; ; )
	{
		if ( WelsTargetMbConstruction( pCtx ) )
		{
			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::MB(%d, %d) construction error. pCurSlice_type:%d\n",
				pLayer->iMbX, pLayer->iMbY, pSlice->eSliceType );
			return -1;
		}

		++iMbsDone;
		++pLayer->pDec->iTotalNumMbRec;

		// every macroblock parsed for this slice has been rebuilt
		if ( iMbsDone >= kiMbsInSlice )
		{
			break;
		}

		// the picture-wide counter must never exceed the SPS macroblock count
		if ( pLayer->pDec->iTotalNumMbRec > kiMbLimit )
		{
			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::fdec->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d\n",
				pLayer->pDec->iTotalNumMbRec, kiMbLimit );
			return -1;
		}

		// advance: FMO map when several slice groups exist, raster order otherwise
		if ( pHeader->pPps->uiNumSliceGroups > 1 )
		{
			iMbIdx = FmoNextMb( pFmo, iMbIdx );
		}
		else
		{
			++iMbIdx;
		}

		// slice group boundary or end of a frame
		if ( -1 == iMbIdx || iMbIdx >= kiMbLimit )
		{
			break;
		}

		pLayer->iMbX       = iMbIdx % pLayer->iMbWidth;
		pLayer->iMbY       = iMbIdx / pLayer->iMbWidth;
		pLayer->iMbXyIndex = iMbIdx;
	}

	pCtx->pDec->iWidthInPixel  = kiWidthInPix;
	pCtx->pDec->iHeightInPixel = kiHeightInPix;

	// Deblock only I and P slices, and only when the slice header does not
	// disable the filter (idc == 1). Other idc values are all treated alike here.
	if ( ( I_SLICE == pSlice->eSliceType || P_SLICE == pSlice->eSliceType )
		&& 1 != pHeader->uiDisableDeblockingFilterIdc )
	{
		PDeblockingFilterMbFunc pDeblockMb = WelsDeblockingMb;
		WelsDeblockingFilterSlice( pCtx, pDeblockMb );
	}

	return 0;
}

/*!
 * \brief	Add the residual of the current inter macroblock onto the prediction
 *			already stored in the destination planes.
 *
 * The two chroma DC groups (coefficient offsets 256 and 320) are transformed
 * first with WelsChromaDcIdct(). Each 4x4 block is then passed to
 * pCtx->pIdctResAddPredFunc: luma blocks when their non-zero count is set,
 * chroma blocks when either the count or the block's first coefficient is non-zero.
 *
 * \param	pCtx		decoder context (supplies pIdctResAddPredFunc)
 * \param	pCurLayer	layer holding coefficients and non-zero counts of the macroblock
 * \param	pDstY		top-left luma sample of the macroblock
 * \param	pDstU		top-left Cb sample of the macroblock
 * \param	pDstV		top-left Cr sample of the macroblock
 * \param	iStrideL	luma stride
 * \param	iStrideC	chroma stride
 * \return	always 0
 */
int32_t WelsMbInterSampleConstruction( PWelsDecoderContext pCtx, PDqLayer pCurLayer, 
											  uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC )
{
	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
	int16_t* pCoeff      = pCurLayer->pScaledTCoeff[kiMbXy];
	int32_t iBlk;

	WelsChromaDcIdct( pCoeff + 256 );	// Cb coefficients start at 16*16
	WelsChromaDcIdct( pCoeff + 320 );	// Cr coefficients start at 16*16 + 16*4

	// luma: 16 blocks of 16 coefficients each
	for ( iBlk = 0; iBlk < 16; ++iBlk )
	{
		const int32_t kiNzcIdx = g_kuiMbNonZeroCountIdx[iBlk];
		if ( !pCurLayer->pNzc[kiMbXy][kiNzcIdx] )
		{
			continue;
		}

		const int32_t kiPixOffset = ((kiNzcIdx>>2)<<2) * iStrideL + ((kiNzcIdx%4)<<2);
		pCtx->pIdctResAddPredFunc( pDstY + kiPixOffset, iStrideL, pCoeff + (iBlk<<4) );
	}

	// chroma: four Cb blocks (16..19) interleaved with four Cr blocks (20..23)
	for ( iBlk = 0; iBlk < 4; ++iBlk )
	{
		int16_t* pCbCoeff = pCoeff + ((iBlk+16)<<4);
		int16_t* pCrCoeff = pCoeff + ((iBlk+20)<<4);

		const int32_t kiCbIdx = g_kuiMbNonZeroCountIdx[iBlk+16];
		if ( pCurLayer->pNzc[kiMbXy][kiCbIdx] || *pCbCoeff )
		{
			const int32_t kiRel       = kiCbIdx - 16;
			const int32_t kiPixOffset = ((kiRel>>2)<<2) * iStrideC + ((kiRel%4)<<2);
			pCtx->pIdctResAddPredFunc( pDstU + kiPixOffset, iStrideC, pCbCoeff );
		}

		const int32_t kiCrIdx = g_kuiMbNonZeroCountIdx[iBlk+20];
		if ( pCurLayer->pNzc[kiMbXy][kiCrIdx] || *pCrCoeff )
		{
			const int32_t kiRel       = kiCrIdx - 18;
			const int32_t kiPixOffset = ((kiRel>>2)<<2) * iStrideC + ((kiRel%4)<<2);
			pCtx->pIdctResAddPredFunc( pDstV + kiPixOffset, iStrideC, pCrCoeff );
		}
	}

	return 0;
}
/*!
 * \brief	Fully reconstruct an inter macroblock that carries residual data:
 *			motion-compensated prediction followed by residual addition.
 *
 * \param	pCtx		decoder context
 * \param	pCurLayer	layer whose iMbX / iMbY / iMbXyIndex select the macroblock
 * \return	always 0
 */
int32_t WelsMbInterConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
{
	const int32_t kiStrideY = pCtx->pDec->iLinesize[0];
	const int32_t kiStrideC = pCtx->pDec->iLinesize[1];

	// macroblock origin: 16x16 luma samples, 8x8 samples per chroma plane
	const int32_t kiOriginY = (pCurLayer->iMbY * kiStrideY + pCurLayer->iMbX)<<4;
	const int32_t kiOriginC = (pCurLayer->iMbY * kiStrideC + pCurLayer->iMbX)<<3;

	uint8_t* pPlaneY  = pCurLayer->pDec->pData[0] + kiOriginY;
	uint8_t* pPlaneCb = pCurLayer->pDec->pData[1] + kiOriginC;
	uint8_t* pPlaneCr = pCurLayer->pDec->pData[2] + kiOriginC;

	GetInterPred(pPlaneY, pPlaneCb, pPlaneCr, pCtx);
	WelsMbInterSampleConstruction( pCtx, pCurLayer, pPlaneY, pPlaneCb, pPlaneCr, kiStrideY, kiStrideC );

	// Per the original author's note: sets every non-zero nzc entry to 1;
	// the deblocking side could be optimized further.
	pCtx->sBlockFunc.pWelsSetNonZeroCountFunc(NULL, pCurLayer->pNzc[pCurLayer->iMbXyIndex]);
	return 0;
}

/*!
 * \brief	In-place 4x4 butterfly transform plus dequantization of the 16 luma DC
 *			coefficients of an Intra16x16 macroblock.
 *
 * The DC terms are not contiguous: they are addressed through the two offset
 * tables below, whose sums are the 16 distinct multiples of 16 from 0 to 240.
 * All 16 inputs are read into a temporary array before anything is written back.
 *
 * \param	pBlock	coefficient buffer of the macroblock (modified in place)
 * \param	iQp		index into g_kuiDequantCoeff; not range-checked here
 */
void_t WelsLumaDcDequantIdct(int16_t *pBlock, int32_t iQp){
#define STRIDE 16
	static const int32_t kiColOffset[4] = {0, STRIDE, STRIDE<<2,  5*STRIDE};
	static const int32_t kiRowOffset[4] = {0, STRIDE<<1, STRIDE<<3, 10*STRIDE};

	const int32_t kiScale = g_kuiDequantCoeff[iQp][0];
	int32_t iStage[16];	// result of the first pass, four values per outer index
	int32_t iIdx;

	// first pass: one butterfly per row-table entry
	for ( iIdx = 0; iIdx < 4; ++iIdx )
	{
		const int16_t* pIn = pBlock + kiRowOffset[iIdx];
		int32_t* pOut      = iStage + (iIdx << 2);

		const int32_t kiIn0 = pIn[kiColOffset[0]];
		const int32_t kiIn1 = pIn[kiColOffset[1]];
		const int32_t kiIn2 = pIn[kiColOffset[2]];
		const int32_t kiIn3 = pIn[kiColOffset[3]];

		const int32_t kiSum02  = kiIn0 + kiIn2;
		const int32_t kiDiff02 = kiIn0 - kiIn2;
		const int32_t kiDiff13 = kiIn1 - kiIn3;
		const int32_t kiSum13  = kiIn1 + kiIn3;

		pOut[0] = kiSum02  + kiSum13;
		pOut[1] = kiDiff02 + kiDiff13;
		pOut[2] = kiDiff02 - kiDiff13;
		pOut[3] = kiSum02  - kiSum13;
	}

	// second pass across the staged values, then scale with rounding: (x*scale + 2) >> 2
	for ( iIdx = 0; iIdx < 4; ++iIdx )
	{
		int16_t* pOut = pBlock + kiColOffset[iIdx];

		const int32_t kiIn0 = iStage[iIdx];
		const int32_t kiIn1 = iStage[4 + iIdx];
		const int32_t kiIn2 = iStage[8 + iIdx];
		const int32_t kiIn3 = iStage[12 + iIdx];

		const int32_t kiSum02  = kiIn0 + kiIn2;
		const int32_t kiDiff02 = kiIn0 - kiIn2;
		const int32_t kiDiff13 = kiIn1 - kiIn3;
		const int32_t kiSum13  = kiIn1 + kiIn3;

		pOut[kiRowOffset[0]] = ((kiSum02  + kiSum13 )*kiScale + 2)>>2;
		pOut[kiRowOffset[1]] = ((kiDiff02 + kiDiff13)*kiScale + 2)>>2;
		pOut[kiRowOffset[2]] = ((kiDiff02 - kiDiff13)*kiScale + 2)>>2;
		pOut[kiRowOffset[3]] = ((kiSum02  - kiSum13 )*kiScale + 2)>>2;
	}
#undef STRIDE
}

/*!
 * \brief	Reconstruct an Intra16x16 or Intra4x4 macroblock from a private copy of
 *			its scaled coefficients.
 *
 * The coefficients are copied into an aligned stack buffer, so the luma DC
 * transform applied for Intra16x16 does not touch the layer's own array.
 * Other macroblock types fall through without reconstruction.
 * I_PCM is handled by the caller (WelsTargetMbConstruction) before it gets here.
 *
 * \param	pCtx		decoder context
 * \param	pCurLayer	layer whose iMbXyIndex selects the macroblock
 * \param	bOutput		forwarded unchanged to WelsFillRecNeededMbInfo()
 * \return	always 0
 */
int32_t WelsMbIntraPredictionConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput)
{
	const int32_t kiMbXy = pCurLayer->iMbXyIndex;

	FORCE_STACK_ALIGN_1D( int16_t, pCoeffCopy, MB_COEFF_LIST_SIZE, 16 );

	// 384 = 256 luma + 2 * 64 chroma coefficients
	memcpy(pCoeffCopy, pCurLayer->pScaledTCoeff[kiMbXy], 384*sizeof(pCurLayer->pScaledTCoeff[kiMbXy][0]));

	WelsFillRecNeededMbInfo(pCtx, bOutput, pCurLayer);

	if(IS_INTRA16x16(pCurLayer->pMbType[kiMbXy]))
	{
		int32_t iDc;
		// The original author questioned whether this second copy of the 16 DC terms
		// is needed; it is kept so behavior stays identical.
		for ( iDc = 0; iDc < 16; ++iDc )
		{
			const int32_t kiPos = g_kuiLumaDcZigzagScan[iDc];
			pCoeffCopy[kiPos] = pCurLayer->pScaledTCoeff[kiMbXy][kiPos];
		}
		WelsLumaDcDequantIdct(pCoeffCopy, pCurLayer->pLumaQp[kiMbXy]);
		RecI16x16Mb(kiMbXy, pCtx,pCoeffCopy,pCurLayer);
	}
	else if(IS_INTRA4x4(pCurLayer->pMbType[kiMbXy]))
	{
		RecI4x4Mb(kiMbXy, pCtx,pCoeffCopy,pCurLayer);
	}

	return 0;
}

/*!
 * \brief	Write the motion-compensated prediction of the current inter macroblock
 *			into the picture being decoded (no residual is added).
 *
 * \param	pCtx		decoder context
 * \param	pCurLayer	layer whose iMbX / iMbY select the macroblock
 * \return	always 0
 */
int32_t WelsMbInterPrediction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
{
	const int32_t kiStrideY = pCtx->pDec->iLinesize[0];
	const int32_t kiStrideC = pCtx->pDec->iLinesize[1];

	// macroblock origin: 16x16 luma samples, 8x8 samples per chroma plane
	const int32_t kiOriginY = (pCurLayer->iMbY * kiStrideY + pCurLayer->iMbX)<<4;
	const int32_t kiOriginC = (pCurLayer->iMbY * kiStrideC + pCurLayer->iMbX)<<3;

	uint8_t* pPlaneY  = pCurLayer->pDec->pData[0] + kiOriginY;
	uint8_t* pPlaneCb = pCurLayer->pDec->pData[1] + kiOriginC;
	uint8_t* pPlaneCr = pCurLayer->pDec->pData[2] + kiOriginC;

	GetInterPred(pPlaneY, pPlaneCb, pPlaneCr, pCtx);

	return 0;
}

/*!
 * \brief	Copy a rectangle of bytes between two buffers with independent strides.
 *
 * \param	pDst		first byte of the destination rectangle
 * \param	iStrideDst	distance in bytes between consecutive destination rows
 * \param	pSrc		first byte of the source rectangle
 * \param	iStrideSrc	distance in bytes between consecutive source rows
 * \param	iHeight		number of rows to copy
 * \param	iWidth		number of bytes copied per row
 */
void_t WelsMbCopy( uint8_t *pDst, int32_t iStrideDst, uint8_t *pSrc, int32_t iStrideSrc, 
				 int32_t iHeight, int32_t iWidth )
{
	int32_t iRow;
	for ( iRow = 0; iRow < iHeight; ++iRow )
	{
		memcpy( pDst + iRow * iStrideDst, pSrc + iRow * iStrideSrc, iWidth );
	}
}


/*!
 * \brief	Reconstruct the current macroblock according to its type.
 *
 *  - I_PCM:  samples are copied from the layer's pCsData planes into the picture
 *  - intra:  WelsMbIntraPredictionConstruction()
 *  - inter:  prediction only when the coded block pattern is 0 (this includes
 *            skipped macroblocks), prediction plus residual otherwise
 *
 * \param	pCtx	decoder context; works on pCtx->pCurDqLayer
 * \return	0 on success, -1 when the macroblock type is none of the above
 */
int32_t WelsTargetMbConstruction(PWelsDecoderContext pCtx)
{
	PDqLayer pLayer      = pCtx->pCurDqLayer;
	const int32_t kiMbXy = pLayer->iMbXyIndex;

	if ( MB_TYPE_INTRA_PCM == pLayer->pMbType[kiMbXy] )
	{
		// copy cs into fdec, one plane at a time (0: luma 16x16, 1/2: chroma 8x8)
		int32_t iPlane;
		for ( iPlane = 0; iPlane < 3; ++iPlane )
		{
			const int32_t kiStrideIdx = ( 0 == iPlane ) ? 0 : 1;	// both chroma planes share stride slot 1
			const int32_t kiShift     = ( 0 == iPlane ) ? 4 : 3;
			const int32_t kiBlockSize = 1 << kiShift;

			const int32_t kiCsStride  = pLayer->iCsStride[kiStrideIdx];
			const int32_t kiDecStride = pLayer->pDec->iLinesize[kiStrideIdx];

			const int32_t kiCsOffset  = ( pLayer->iMbX + pLayer->iMbY * kiCsStride ) << kiShift;
			const int32_t kiDecOffset = ( pLayer->iMbX + pLayer->iMbY * kiDecStride ) << kiShift;

			uint8_t* pSrc = pLayer->pCsData[iPlane] + kiCsOffset;
			uint8_t* pDec = pLayer->pDec->pData[iPlane] + kiDecOffset;

			WelsMbCopy( pDec, kiDecStride, pSrc, kiCsStride, kiBlockSize, kiBlockSize );
		}
		return 0;
	}

	if ( IS_INTRA( pLayer->pMbType[kiMbXy] ) )
	{
		WelsMbIntraPredictionConstruction( pCtx, pLayer, 1 );
		return 0;
	}

	if ( IS_INTER( pLayer->pMbType[kiMbXy] ) )
	{
		if ( 0 == pLayer->pCbp[kiMbXy] )	// no residual coded; includes SKIP
		{
			WelsMbInterPrediction( pCtx, pLayer );
		}
		else
		{
			WelsMbInterConstruction( pCtx, pLayer );
		}
		return 0;
	}

	WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetMbConstruction():::::Unknown MB type: %d\n", pLayer->pMbType[kiMbXy] );
	return -1;
}

/*!
 * \brief	In-place 2x2 butterfly over the four chroma DC coefficients of one
 *			chroma plane.
 *
 * The four values sit at offsets 0, 16, 32 and 48 from pBlock. Each output is
 * the corresponding sum/difference combination shifted right by one.
 *
 * \param	pBlock	first coefficient of the chroma plane (modified in place)
 */
void_t WelsChromaDcIdct( int16_t *pBlock )
{
	const int32_t kiPos1 = 16;
	const int32_t kiPos2 = 32;
	const int32_t kiPos3 = kiPos1 + kiPos2;

	const int32_t kiDc0 = pBlock[0];
	const int32_t kiDc1 = pBlock[kiPos1];
	const int32_t kiDc2 = pBlock[kiPos2];
	const int32_t kiDc3 = pBlock[kiPos3];

	const int32_t kiSum01  = kiDc0 + kiDc1;
	const int32_t kiDiff01 = kiDc0 - kiDc1;
	const int32_t kiSum23  = kiDc2 + kiDc3;
	const int32_t kiDiff23 = kiDc2 - kiDc3;

	pBlock[0]      = (kiSum01  + kiSum23 ) >> 1;
	pBlock[kiPos1] = (kiDiff01 + kiDiff23) >> 1;
	pBlock[kiPos2] = (kiSum01  - kiSum23 ) >> 1;
	pBlock[kiPos3] = (kiDiff01 - kiDiff23) >> 1;
}

/*!
 * \brief	Parse all macroblocks of the current slice (CAVLC only).
 *
 * Selects the per-macroblock parser for the slice type (P slice, otherwise the
 * I-slice parser), installs the intra cache/mode parsers that match the PPS
 * constrained-intra-pred flag, and then walks the slice macroblock by
 * macroblock. The walk uses FmoNextMb() when the PPS declares more than one
 * slice group and raster order otherwise. pSlice->iTotalMbInCurSlice counts
 * the macroblocks parsed.
 *
 * When the slice header extension sets bSliceSkipFlag, no macroblock data is
 * parsed: uiNumMbsInSlice macroblocks are only tagged with the slice id and a
 * residual prediction flag of 1.
 *
 * \param	pCtx				decoder context; works on pCtx->pCurDqLayer
 * \param	bFirstSliceInLayer	not referenced by this function
 * \param	pNalCur				NAL unit being decoded, forwarded to the macroblock parser
 * \return	ERR_NONE (0) on success; dsNoParamSets when iFirstMbInSlice is not
 *			below the SPS macroblock count; -1 for CABAC streams or when more
 *			bits were consumed than the bit-string holds; otherwise the error
 *			returned by the macroblock parser
 */
int32_t WelsDecodeSlice(PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur)
{
	PDqLayer pCurLayer = pCtx->pCurDqLayer;
	PFmo pFmo = pCtx->pFmo;
	int32_t i, iRet;
	int32_t iNextMbXyIndex, iSliceIdc;

	PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
	PSliceHeaderExt pSliceHeaderExt = &pSlice->sSliceHeaderExt;
	PSliceHeader pSliceHeader = &pSliceHeaderExt->sSliceHeader;
	int32_t iMbX, iMbY;
	const int32_t kiCountNumMb = pSliceHeader->pSps->uiTotalMbCount; //need to be correct when fmo or multi slice
	PBitStringAux pBs = pCurLayer->pBitStringAux;
	int32_t iUsedBits  = 0;

	PWelsDecMbCavlcFunc pDecMbCavlcFunc;

	pSlice->iTotalMbInCurSlice = 0; //initialize at the starting of slice decoding.

	if ( P_SLICE == pSliceHeader->eSliceType )
	{
		pDecMbCavlcFunc = WelsDecodeMbCavlcPSlice;
	}
	else //I_SLICE
	{
		pDecMbCavlcFunc = WelsDecodeMbCavlcISlice;
	}

	// pick the intra neighbour-cache and mode parsers matching constrained_intra_pred
	if ( pSliceHeader->pPps->bConstainedIntraPredFlag )
	{
		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain1Intra4x4;
		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain1;
		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain1;
	}
	else
	{
		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain0Intra4x4;
		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain0;
		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain0;
	}

	pCtx->eSliceType = pSliceHeader->eSliceType;

	if (pCurLayer->sLayerInfo.pPps->bEntropyCodingModeFlag == 1)
	{
		//CABAC encoding is unsupported yet!
		return -1;
	}

	iNextMbXyIndex = pSliceHeader->iFirstMbInSlice;

	if ( iNextMbXyIndex >= kiCountNumMb )
	{
		WelsLog(pCtx, WELS_LOG_ERROR, "WelsDecodeSlice()::iFirstMbInSlice(%d) > pSps->kiTotalMb(%d). ERROR!!! resolution change....\n", 
			iNextMbXyIndex, kiCountNumMb);
		pCtx->iErrorCode |= dsNoParamSets;
		return dsNoParamSets;
	}

	iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
	iMbY = iNextMbXyIndex / pCurLayer->iMbWidth; // error is introduced by multiple slices case, 11/23/2009
	pSlice->iMbSkipRun = -1;
	// slice identifier stored per macroblock: first MB address combined with the layer DQ id
	iSliceIdc = (pSliceHeader->iFirstMbInSlice<<7)+pCurLayer->uiLayerDqId;

	pCurLayer->iMbX =  iMbX;
	pCurLayer->iMbY = iMbY;
	pCurLayer->iMbXyIndex = iNextMbXyIndex;

	if(pSliceHeaderExt->bSliceSkipFlag == 1)
	{
		for(i=0; i<(int32_t)pSliceHeaderExt->uiNumMbsInSlice; i++)
		{
			// uiNumMbsInSlice comes from the bitstream and FmoNextMb() may yield -1:
			// never index the per-MB arrays outside [0, kiCountNumMb)
			if ( (iNextMbXyIndex < 0) || (iNextMbXyIndex >= kiCountNumMb) )
			{
				break;
			}

			pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
			pCurLayer->pResidualPredFlag[iNextMbXyIndex] = 1;

			if ( pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1 )
			{
				iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
			}
			else
			{
				++iNextMbXyIndex;
			}

			// row is index / width, exactly as in the regular parsing loop below
			// (the previous "index % height" produced a wrong iMbY)
			iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
			iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;

			pCurLayer->iMbX =  iMbX;
			pCurLayer->iMbY = iMbY;
			pCurLayer->iMbXyIndex = iNextMbXyIndex;
		}
		return 0;
	}

	do{
		pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
		iRet = pDecMbCavlcFunc( pCtx,  pNalCur );

		if (iRet != ERR_NONE){
			return iRet;
		}

		++pSlice->iTotalMbInCurSlice;

		if ( pSliceHeader->pPps->uiNumSliceGroups > 1 )
		{
			iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
		}
		else
		{
			++iNextMbXyIndex;
		}
		if ( (-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb) )	// slice group boundary or end of a frame
		{
			break;
		}

		// check whether there is left bits to read next time in case multiple slices
		iUsedBits = ((pBs->pCurBuf-pBs->pStartBuf)<<3) - (16-pBs->iLeftBits);
		if ( iUsedBits == pBs->iBits && 0 >= pCurLayer->sLayerInfo.sSliceInLayer.iMbSkipRun )	// slice boundary
		{
			break;
		}
		if ( iUsedBits > pBs->iBits )//When BS incomplete, as long as find it, SHOULD stop decoding to avoid mosaic or crash.
		{
			WelsLog( pCtx, WELS_LOG_WARNING, "WelsDecodeSlice()::::pBs incomplete, iUsedBits:%d > pBs->iBits:%d, MUST stop decoding.\n", 
				iUsedBits, pBs->iBits );
			return -1;
		}
		iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
		iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;
		pCurLayer->iMbX =  iMbX;
		pCurLayer->iMbY = iMbY;
		pCurLayer->iMbXyIndex = iNextMbXyIndex;
	}while(1);

	return ERR_NONE;
}

/*!
 * \brief	Parse one macroblock of an I slice from a CAVLC bitstream.
 *
 * Reads mb_type and then, depending on its value:
 *  - 25     : I_PCM - 384 raw bytes are copied from the bitstream into the layer's
 *             pCsData planes and the function returns immediately
 *  - 0      : Intra4x4 - prediction modes are parsed and the coded block pattern is
 *             read and mapped through g_kuiIntra4x4CbpTable
 *  - 1..24  : Intra16x16 - prediction mode and coded block pattern are derived from
 *             mb_type itself
 * For the two non-PCM types the QP delta and the luma/chroma residual blocks are
 * parsed whenever the coded block pattern is non-zero or the type is Intra16x16.
 * Results are written to the per-macroblock arrays of the current layer
 * (pMbType, pCbp, pLumaQp, pChromaQp, pNzc, pScaledTCoeff).
 *
 * \param	pCtx	decoder context; the macroblock parsed is the one selected by
 *					pCtx->pCurDqLayer->iMbXyIndex
 * \return	0 on success; ERR_INFO_INVALID_MB_TYPE, ERR_INFO_INVALID_CBP or
 *			ERR_INFO_INVALID_QP for out-of-range syntax elements; -1 when a
 *			prediction-mode or residual parser reports failure
 */
int32_t WelsActualDecodeMbCavlcISlice(PWelsDecoderContext pCtx)
{	
	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;

	SNeighAvail sNeighAvail;

	// coefficient scan range taken from the slice header extension
	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	

	int32_t iMbX = pCurLayer->iMbX;
	int32_t iMbY = pCurLayer->iMbY;
	int32_t iMbXy = pCurLayer->iMbXyIndex;
	int32_t iNMbMode, i;
	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;

	// local non-zero-count cache filled by the WelsFillCache* helpers and updated by
	// WelsResidualBlockCavlc(); NOTE(review): layout looks like 8 entries per row - confirm
	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
	
	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
	pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
	
	// mb_type: valid values in an I slice are 0..25
	uiMbType = BsGetUe(pBs);
	if ( uiMbType > 25 )
	{
		return ERR_INFO_INVALID_MB_TYPE;
	}	

	if ( 25 == uiMbType )
	{
		// I_PCM: raw samples follow in the bitstream
		int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
		int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
		
		int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
		int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
		
		// NOTE(review): destination is pCsData but offsets/strides come from
		// pDec->iLinesize, while WelsTargetMbConstruction reads pCsData with
		// iCsStride - presumably both strides are equal; confirm
		uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
		uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
		uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
		
		uint8_t *pTmpBsBuf;
		
		int32_t i;	// shadows the function-level i
		int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );	// 16 bytes per luma row
		int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );	// 8 bytes per chroma row

		// number of bytes to step the read pointer back, derived from the bit
		// reader's iLeftBits; presumably accounts for bytes already prefetched - confirm
		int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
		
		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
		
		//step 1: locating bit-stream pointer [must align into integer byte]	
		pBs->pCurBuf -= iIndex;
		
		//step 2: copy pixel from bit-stream into fdec [reconstruction]		
		// NOTE(review): no check here that 384 bytes remain in the bit-string buffer
		pTmpBsBuf = pBs->pCurBuf;
		for ( i = 0; i < 16; i++ ) //luma
		{
			memcpy( pDecY , pTmpBsBuf, iCopySizeY );
			pDecY += iDecStrideL;				
			pTmpBsBuf += 16;
		}
		for ( i = 0; i < 8; i++ ) //cb
		{				
			memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
			pDecU += iDecStrideC;				
			pTmpBsBuf += 8;
		}
		for ( i = 0; i < 8; i++ ) //cr
		{				
			memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
			pDecV += iDecStrideC;
			pTmpBsBuf += 8;
		}	

		// skip the 256 + 64 + 64 sample bytes just copied and re-prime the bit reader
		pBs->pCurBuf += 384;
		InitReadBits( pBs );
		
		//step 3: update QP and pNonZeroCount
		pCurLayer->pLumaQp[iMbXy] = 0;
		pCurLayer->pChromaQp[iMbXy] = 0;
		// every non-zero count of an I_PCM macroblock is set to 16
		memset( pCurLayer->pNzc[iMbXy], 16, sizeof( pCurLayer->pNzc[iMbXy] ) ); //JVT-x201wcm1.doc, page229, 2009.10.23		
		return 0;				
	}
	else if (0 == uiMbType) //reference to JM
	{
		// Intra4x4: fill the neighbour caches, then parse the prediction modes
		FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
		pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
		if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
		{
			return -1;
		}

		// coded_block_pattern: the code number (0..47) is mapped through the intra table
		uiCbp = BsGetUe(pBs);
		//G.9.1 Alternative parsing process for coded pBlock pattern
		if ( uiCbp > 47 ) 
			return ERR_INFO_INVALID_CBP;

		uiCbp = g_kuiIntra4x4CbpTable[uiCbp];

		pCurLayer->pCbp[iMbXy] = uiCbp;
		uiCbpC = uiCbp >> 4;	// chroma part: bits above the low four
		uiCbpL = uiCbp & 15;	// luma part: one bit per 8x8 block
	}
	else //I_PCM exclude, we can ignore it
	{
		// Intra16x16 (mb_type 1..24): prediction mode and cbp are encoded in mb_type
		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
		pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
		pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
		WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
		if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
		{
			return -1;
		}
	}
	
	iNMbMode = BASE_MB;					

	// reset the 384 coefficients and the 24 non-zero counts of this macroblock
	// (ST32 presumably stores four bytes at once - six stores cover entries 0..23)
	memset(pCurLayer->pScaledTCoeff[iMbXy], 0, 384*sizeof(pCurLayer->pScaledTCoeff[iMbXy][0]));
	ST32(&pCurLayer->pNzc[iMbXy][0], 0);
	ST32(&pCurLayer->pNzc[iMbXy][4], 0);
	ST32(&pCurLayer->pNzc[iMbXy][8], 0);
	ST32(&pCurLayer->pNzc[iMbXy][12], 0);
	ST32(&pCurLayer->pNzc[iMbXy][16], 0);
	ST32(&pCurLayer->pNzc[iMbXy][20], 0);

	// Intra4x4 without residual: no QP delta is read, so the QP of the previous
	// macroblock in the slice is inherited
	if( pCurLayer->pCbp[iMbXy] == 0 && IS_INTRA4x4(pCurLayer->pMbType[iMbXy]))
	{
		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + 
											pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];

	}

	// residual data is present when cbp != 0, and always for Intra16x16 (luma DC)
	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
	{
		int32_t iQpDelta, iId8x8, iId4x4;		

		// mb_qp_delta, accepted range [-26, 25]
		iQpDelta = BsGetSe(pBs);

        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
		{
			return ERR_INFO_INVALID_QP;
		}

		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
		//refer to JVT-X201wcm1.doc equation(7-35)
		// wrap the sum back by +/-52 when it leaves [0, 51]
		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
		{
			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
			{
				pCurLayer->pLumaQp[iMbXy] += 52;
			} 
			else
			{
				pCurLayer->pLumaQp[iMbXy] -= 52;
			}
		}
		//QP should be in the range of [0, 51]
		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
		{
			return ERR_INFO_INVALID_QP;
		}
		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
		// chroma QP: table lookup on the luma QP plus the PPS offset, clipped to [0, 51]
		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];


		BsStartCavlc( pBs );

		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
		{
			//step1: Luma DC
			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16,
				g_kuiLumaDcZigzagScan, I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
			{
				return -1;//abnormal
			}
			//step2: Luma AC
			if (uiCbpL)
			{
				// AC blocks never contain scan position 0, hence the start index of at least 1
				for (i = 0; i < 16; i++) 
				{	
                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, i,
								iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan+ WELS_MAX(iScanIdxStart,1),
								I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) ) 
					{
							return -1;//abnormal
					}					
				}
				// copy the four luma rows of the cache into the layer's per-MB counts
				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
			}
			else //pNonZeroCount = 0
			{
				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
				ST32(&pCurLayer->pNzc[iMbXy][4], 0);	
				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
			}
		}
		else //non-MB_TYPE_INTRA16x16
		{	
			// one cbp bit per 8x8 luma block; each coded 8x8 holds four 4x4 blocks
			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
			{
				if (uiCbpL & (1 << iId8x8)) 
				{
					int32_t iIndex = (iId8x8 << 2);
					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
					{
						//Luma (DC and AC decoding together)
						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, 
							LUMA_DC_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
						{
							return -1;//abnormal
						}
						iIndex++;
					}
				}
				else
				{
					// 8x8 not coded: clear the cache entries of its four 4x4 blocks
					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)]], 0);
					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]], 0);
				}
			}	
			// copy the four luma rows of the cache into the layer's per-MB counts
			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
		}

		//chroma 
		//step1: DC
		// chroma cbp 1: DC only; 2: DC and AC
		if ( 1 == uiCbpC || 2 == uiCbpC )
		{	
			for (i = 0; i < 2; i++) //Cb Cr
			{			
				// Cb coefficients start at offset 256, Cr at 256 + 64
				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
				{
					return -1;//abnormal
				}
			}
		}

		//step2: AC
		if (2 == uiCbpC)
		{
			for (i = 0; i < 2; i++) //Cb Cr
			{
				// blocks 16..19 belong to Cb, 20..23 to Cr
				int32_t iIndex = 16 + (i<<2);
				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
				{
					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex, 
						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1), 
						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
					{
						return -1;//abnormal
					}
					iIndex++;
				}
			}
			// copy the chroma counts out of the cache, two entries at a time
			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
		}
		else 
		{
			// no chroma AC coded: chroma counts stay zero
			ST16(&pCurLayer->pNzc[iMbXy][16], 0);
			ST16(&pCurLayer->pNzc[iMbXy][20], 0);
			ST16(&pCurLayer->pNzc[iMbXy][18], 0);
			ST16(&pCurLayer->pNzc[iMbXy][22], 0);
		}
		BsEndCavlc( pBs ); 
	}
	else
	{
		// no residual at all: every non-zero count is zero (repeats the reset above)
		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
	}	

	return 0;
}

/*!
 * \brief	Entry point for parsing one macroblock of an I slice.
 *
 * Determines the base-mode flag (one bit read from the bitstream when the
 * slice header extension marks it adaptive, the slice default otherwise).
 * A set flag means inter-layer prediction, which is rejected as unsupported;
 * otherwise the work is delegated to WelsActualDecodeMbCavlcISlice().
 *
 * \param	pCtx	decoder context
 * \param	pNalCur	current NAL unit (not referenced by this function)
 * \return	0 on success, the generated ERR_INFO_UNSUPPORTED_ILP code when the
 *			base-mode flag is set, otherwise the error of the actual parser
 */
int32_t WelsDecodeMbCavlcISlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
{
	PDqLayer pLayer          = pCtx->pCurDqLayer;
	PBitStringAux pBs        = pLayer->pBitStringAux;
	PSliceHeaderExt pHdrExt  = &pLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
	int32_t iBaseMode        = pHdrExt->bDefaultBaseModeFlag;

	// an adaptive flag is signalled per macroblock and overrides the slice default
	if ( 1 == pHdrExt->bAdaptiveBaseModeFlag )
	{
		iBaseMode = BsGetOneBit(pBs);
	}

	if ( iBaseMode )
	{
		WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseMode);
		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
	}

	// a non-zero result is a parsing error and must stop decoding; 0 is success
	return WelsActualDecodeMbCavlcISlice( pCtx );
}

int32_t WelsActualDecodeMbCavlcPSlice(PWelsDecoderContext pCtx)
{
	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;

	SNeighAvail sNeighAvail;

	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	

	int32_t iMbX = pCurLayer->iMbX;
	int32_t iMbY = pCurLayer->iMbY;
	int32_t iMbXy = pCurLayer->iMbXyIndex;

	int32_t iNMbMode, i;
	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;

	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;//2009.10.23
	
	uiMbType = BsGetUe(pBs);	
	if (uiMbType < 5) //inter MB type
	{
		int16_t iMotionVector[LIST_A][30][MV_A];	

		int8_t	iRefIndex[LIST_A][30];
		pCurLayer->pMbType[iMbXy] = g_ksInterMbTypeInfo[uiMbType].iType;
		WelsFillCacheInter( &sNeighAvail, pNonZeroCount, iMotionVector, iRefIndex, pCurLayer );
		if ( ParseInterInfo(pCtx, iMotionVector, iRefIndex, pBs) )
		{
			return -1;//abnormal
		}

		if( pSlice->sSliceHeaderExt.bAdaptiveResidualPredFlag ==1 )
		{
			pCurLayer->pResidualPredFlag[iMbXy] =  BsGetOneBit(pBs);
		}
		else
		{
			pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
		}

		if(pCurLayer->pResidualPredFlag[iMbXy] == 0)
		{
			iNMbMode = BASE_MB;
			pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
		}
		else 
		{
            WelsLog(pCtx, WELS_LOG_WARNING, "residual_pred_flag = 1 not supported.\n");
            return -1;
		}
	}
	else //intra MB type
	{
		uiMbType -= 5;
		if ( uiMbType > 25 )
		{
			return ERR_INFO_INVALID_MB_TYPE;
		}
		
		if ( 25 == uiMbType )
		{
			int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
			int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
		
			int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
			int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
		
			uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
			uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
			uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
		
			uint8_t *pTmpBsBuf;
		
			int32_t i;
			int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );
			int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );

			int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
			
			pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;

			//step 1: locating bit-stream pointer [must align into integer byte]
			pBs->pCurBuf -= iIndex;
		
		    //step 2: copy pixel from bit-stream into fdec [reconstruction]			
			pTmpBsBuf = pBs->pCurBuf;
			for ( i = 0; i < 16; i++ ) //luma
			{
				memcpy( pDecY , pTmpBsBuf, iCopySizeY );
				pDecY += iDecStrideL;				
				pTmpBsBuf += 16;
			}
			
			for ( i = 0; i < 8; i++ ) //cb
			{				
				memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
				pDecU += iDecStrideC;				
				pTmpBsBuf += 8;
			}
			for ( i = 0; i < 8; i++ ) //cr
			{				
				memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
				pDecV += iDecStrideC;
				pTmpBsBuf += 8;
			}		

			pBs->pCurBuf += 384;
			InitReadBits( pBs );
		
		    //step 3: update QP and pNonZeroCount
			pCurLayer->pLumaQp[iMbXy] = 0;
			pCurLayer->pChromaQp[iMbXy] = 0;		
			ST32(&pCurLayer->pNzc[iMbXy][0], 0);
			ST32(&pCurLayer->pNzc[iMbXy][4], 0);
			ST32(&pCurLayer->pNzc[iMbXy][8], 0);
			ST32(&pCurLayer->pNzc[iMbXy][12], 0);
			return 0;
		}
		else
		{
			if (0 == uiMbType) 
			{
				FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
				pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
				if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
				{
					return -1;
				}
				iNMbMode = BASE_MB;
			}
			else //I_PCM exclude, we can ignore it
			{
				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
				pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
				pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
				uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
				uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
				WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
				if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
				{
					return -1;
				}
				iNMbMode = BASE_MB;
			}
		}
	}	
	
	if ( MB_TYPE_INTRA16x16 != pCurLayer->pMbType[iMbXy] ) 
	{
		uiCbp = BsGetUe(pBs);	
		{
			if ( uiCbp > 47 ) 
				return ERR_INFO_INVALID_CBP;

			if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy]) 
			{
				uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
			}
			else //inter
				uiCbp = g_kuiInterCbpTable[uiCbp];
		}

		pCurLayer->pCbp[iMbXy] = uiCbp;
		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
	}		

	if(iNMbMode == BASE_MB)
	{
		pCtx->sBlockFunc.pWelsBlockZero16x16Func(pCurLayer->pScaledTCoeff[iMbXy], 16);
		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256, 8);
		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256+64, 8);

		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
		if( pCurLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16(pCurLayer->pMbType[iMbXy]) && !IS_I_BL(pCurLayer->pMbType[iMbXy]))
		{
			pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
			pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
		}
	}	

	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] )
	{
		int32_t iQpDelta, iId8x8, iId4x4;	
		
		iQpDelta = BsGetSe(pBs);

        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
		{
			return ERR_INFO_INVALID_QP;
		}

		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
		//refer to JVT-X201wcm1.doc equation(7-35)	
		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
		{
			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
			{
				pCurLayer->pLumaQp[iMbXy] += 52;
			} 
			else
			{
				pCurLayer->pLumaQp[iMbXy] -= 52;
			}
		}
		//QP should be in the range of [0, 51]
		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
		{
			return ERR_INFO_INVALID_QP;
		}
		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];

		BsStartCavlc( pBs );

		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
		{
			//step1: Luma DC
			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16, g_kuiLumaDcZigzagScan, 
				I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
			{
				return -1;//abnormal
			}
			//step2: Luma AC
			if (uiCbpL)
			{
				for (i = 0; i < 16; i++) 
				{
                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount,pBs, i,
							iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
							I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
					{
						return -1;//abnormal
					}
				}
				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
			}
			else //pNonZeroCount = 0
			{
				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
				ST32(&pCurLayer->pNzc[iMbXy][4], 0);
				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
			}
		}
		else //non-MB_TYPE_INTRA16x16
		{	
			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
			{
				if (uiCbpL & (1 << iId8x8)) 
				{
					int32_t iIndex = (iId8x8 << 2);
					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
					{
						//Luma (DC and AC decoding together)
						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, LUMA_DC_AC,
							pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
						{
							return -1;//abnormal
						}
						iIndex++;
					}
				}
				else
				{					
					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[iId8x8<<2]],0);
					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]],0);
				}
			}	
			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
        }

		
		//chroma 
		//step1: DC
		if ( 1 == uiCbpC || 2 == uiCbpC )
		{	
			for (i = 0; i < 2; i++) //Cb Cr
			{	
				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
				{
					return -1;//abnormal
				}
			}
		}
		else
		{
		}
		//step2: AC
		if (2 == uiCbpC)
		{
			for (i = 0; i < 2; i++) //Cb Cr
			{
				int32_t iIndex= 16 + (i<<2);
				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
				{
					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
					{
						return -1;//abnormal
					}
					iIndex++;
				}
			}
			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
		}
		else 
		{
			ST32(&pCurLayer->pNzc[iMbXy][16], 0);
			ST32(&pCurLayer->pNzc[iMbXy][20], 0); 
		}
		BsEndCavlc( pBs );
	}
	else
	{
		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
	}	

	return 0;
}

/*!
 * \brief	Decode the current macroblock of a CAVLC-coded P slice.
 *
 *	Handles the skip-run bookkeeping of the slice. While a skip run is pending the
 *	macroblock is set up as MB_TYPE_SKIP (zero non-zero-counts, zero reference
 *	indices, motion vector taken from PredPSkipMvFromNeighbor) and nothing else is
 *	parsed. Otherwise the base-mode flag is resolved and the macroblock is handed
 *	to WelsActualDecodeMbCavlcPSlice.
 *
 * \param	pCtx		decoder context; pCtx->pCurDqLayer supplies the layer, bitstream and slice
 * \param	pNalCur		current NAL unit; only its quality/dependency ids are read here
 *
 * \return	0 on success;
 *			-1 when the skip run read from the bitstream is negative once stored as a signed value;
 *			GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP) when the base-mode flag is set;
 *			otherwise whatever WelsActualDecodeMbCavlcPSlice returns.
 */
int32_t WelsDecodeMbCavlcPSlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
{
	PDqLayer pCurLayer			= pCtx->pCurDqLayer;
	PBitStringAux pBs			= pCurLayer->pBitStringAux;
	PSlice pSlice				= &pCurLayer->sLayerInfo.sSliceInLayer;
	PSliceHeader pSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;

	const int32_t iMbXy = pCurLayer->iMbXyIndex;
	int32_t iBaseModeFlag;

	//-1 is the "no pending skip run" marker: fetch the next run length from the bitstream
	if (-1 == pSlice->iMbSkipRun)
	{
		const int32_t iSkipRun = (int32_t)BsGetUe(pBs);

		//A negative run can never be counted down to the -1 marker again, so every
		//following macroblock would silently be treated as skipped. Reject it here
		//and leave the stored marker untouched.
		if ( iSkipRun < 0 )
		{
			return -1;
		}
		pSlice->iMbSkipRun = iSkipRun;
	}

	//Consume one unit of the run; a run of 0 falls through (and becomes -1 again)
	if (pSlice->iMbSkipRun--)
	{
		int16_t iMv[2] = {0};
		uint32_t uiPackedMv = 0;
		int32_t iNzcOffset, iBlkIdx;

		pCurLayer->pMbType[iMbXy] = MB_TYPE_SKIP;

		//no residual: clear all 24 non-zero-count entries, four bytes per store
		for ( iNzcOffset = 0; iNzcOffset < 24; iNzcOffset += 4 )
		{
			ST32(&pCurLayer->pNzc[iMbXy][iNzcOffset], 0);
		}

		pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
		memset(pCurLayer->pRefIndex[0][iMbXy], 0, sizeof(int8_t) * 16);

		//predict the motion vector and replicate it to all 16 block positions
		PredPSkipMvFromNeighbor( pCurLayer, iMv );
		//pack both 16-bit components with memcpy instead of reading the int16_t
		//array through a uint32_t pointer (aliasing / alignment hazard)
		memcpy( &uiPackedMv, iMv, sizeof(uiPackedMv) );
		for ( iBlkIdx = 0; iBlkIdx < 16; iBlkIdx++ )
		{
			ST32( pCurLayer->pMv[0][iMbXy][iBlkIdx], uiPackedMv );
		}

		if (!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag)
		{
			memset(pCurLayer->pScaledTCoeff[iMbXy], 0, 384*sizeof(int16_t));
		}

		//a skipped macroblock inherits the QP of the previously decoded macroblock
		if (!pSlice->sSliceHeaderExt.bDefaultResidualPredFlag ||
			(pNalCur->sNalHeaderExt.uiQualityId==0 && pNalCur->sNalHeaderExt.uiDependencyId==0))
		{
			pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
			pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
		}

		pCurLayer->pCbp[iMbXy] = 0;

		return 0;
	}

	//base-mode flag: read from the bitstream when adaptive, else use the slice default
	if (pSlice->sSliceHeaderExt.bAdaptiveBaseModeFlag == 1)
	{
		iBaseModeFlag = BsGetOneBit(pBs);
	}
	else
	{
		iBaseModeFlag = pSlice->sSliceHeaderExt.bDefaultBaseModeFlag;
	}

	if ( iBaseModeFlag )
	{
		WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
	}

	//any non-zero result is a parsing error and is passed straight to the caller
	return WelsActualDecodeMbCavlcPSlice( pCtx );
}

/*!
 * \brief	Byte-fill a rectangular region of 16-bit coefficients, row by row.
 *
 * \param	pBlock		first element of the top row
 * \param	iWidth		number of int16_t elements filled in each row
 * \param	iHeight		number of rows; nothing is written when it is <= 0
 * \param	iStride		distance between the starts of consecutive rows, in int16_t elements
 * \param	uiVal		value handed to memset, i.e. written into every BYTE of the row
 *						(each int16_t only equals uiVal when uiVal is 0)
 */
void_t WelsBlockInit(int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal)
{
	const size_t kuiRowBytes = iWidth * sizeof(int16_t);
	int16_t* pRow = pBlock;

	for ( int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft )
	{
		memset( pRow, uiVal, kuiRowBytes );
		pRow += iStride;
	}
}

/*!
 * \brief	Fill the block function table with the routines matching the CPU features.
 *
 *	The plain C routines are always installed first. When the build defines X86_ASM
 *	and iCpu has WELS_CPU_SSE2 set, the two block-zeroing entries are replaced by
 *	their SSE2 counterparts; the non-zero-count routine always stays the C version.
 *
 * \param	pFunc	function table to populate
 * \param	iCpu	CPU feature bit mask (only consulted when X86_ASM is defined)
 */
void_t WelsBlockFuncInit(SBlockFunc  * pFunc,  int32_t iCpu)
{
	//baseline: portable C implementations
	pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_c;
	pFunc->pWelsBlockZero8x8Func		= WelsBlockZero8x8_c;
	pFunc->pWelsBlockZero16x16Func		= WelsBlockZero16x16_c;

#ifdef  X86_ASM
	const bool kbUseSse2 = ( iCpu & WELS_CPU_SSE2 ) != 0;

	if ( kbUseSse2 )
	{
		pFunc->pWelsBlockZero8x8Func	= WelsResBlockZero8x8_sse2;
		pFunc->pWelsBlockZero16x16Func	= WelsResBlockZero16x16_sse2;
	}
#endif
}
/*!
 * \brief	C routine clearing a 16x16 block of int16_t coefficients to zero.
 *
 * \param	pBlock		first element of the block
 * \param	iStride		distance between consecutive rows, in int16_t elements
 */
void_t WelsBlockZero16x16_c(int16_t * pBlock, int32_t iStride)
{
	const int32_t kiBlockEdge = 16;	//both width and height of the block

	WelsBlockInit( pBlock, kiBlockEdge, kiBlockEdge, iStride, 0 );
}

/*!
 * \brief	C routine clearing an 8x8 block of int16_t coefficients to zero.
 *
 * \param	pBlock		first element of the block
 * \param	iStride		distance between consecutive rows, in int16_t elements
 */
void_t WelsBlockZero8x8_c(int16_t * pBlock, int32_t iStride)
{
	const int32_t kiBlockEdge = 8;	//both width and height of the block

	WelsBlockInit( pBlock, kiBlockEdge, kiBlockEdge, iStride, 0 );
}

/*!
 * \brief	Reduce non-zero-count entries to plain 0/1 flags, in place.
 *
 *	Visits the 24 positions listed in g_kuiMbNonZeroCountIdx and replaces each
 *	addressed entry by 1 when it is non-zero and by 0 otherwise.
 *
 * \param	pBlock			not used by this C implementation
 * \param	pNonZeroCount	non-zero-count array, modified in place
 */
void_t SetNonZeroCount_c(int16_t* pBlock, int8_t* pNonZeroCount)
{
	for ( int32_t iEntry = 0; iEntry < 24; ++iEntry )
	{
		int8_t* pCount = &pNonZeroCount[ g_kuiMbNonZeroCountIdx[iEntry] ];

		*pCount = ( 0 != *pCount ) ? 1 : 0;
	}
}

} // namespace WelsDec