shithub: openh264

ref: 82a492262e7d426e844af64b8f8752b4d8617e9e
dir: /codec/encoder/core/src/encoder_ext.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file	encoder_ext.cpp
 *
 * \brief	core encoder for SVC
 *
 * \date	7/24/2009 Created
 *
 *************************************************************************************
 */
#include <string.h>
#include <stdlib.h>
#include <assert.h>

#include "encoder.h"
#include "extern.h"
#include "encoder_context.h"
#include "typedefs.h"
#include "wels_const.h"
#include "wels_common_basis.h"
#include "codec_def.h"
#include "param_svc.h"
#include "cpu_core.h"
#include "cpu.h"
#include "utils.h"
#include "svc_enc_frame.h"
#include "svc_enc_golomb.h"
#include "svc_enc_slice_segment.h"
#include "au_set.h"
#include "picture_handle.h"
#include "codec_app_def.h"
#include "svc_base_layer_md.h"
#include "svc_encode_slice.h"
#include "decode_mb_aux.h"
#include "deblocking.h"
#include "rc.h"
#include "ref_list_mgr_svc.h"
#include "md.h"
#include "ls_defines.h"
#include "set_mb_syn_cavlc.h"
#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
#include "array_stack_align.h"
// for MT, 4/22/2010
#include "slice_multi_threading.h"
#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
#include "measure_time.h"
#endif//DYNAMIC_SLICE_ASSIGN

namespace WelsSVCEnc {


// Forward declaration only; the definition is not part of this section of the file.
// The partition is given as a half-open macroblock range: [iFirstMbInPartition, iEndMbInPartition).
int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
									SLayerBSInfo *pLbi,
									int32_t *pNalIdxInLayer,									
									int32_t* pLayerSize,
									int32_t iFirstMbInPartition,	// first mb inclusive in partition
									int32_t iEndMbInPartition,	// end mb exclusive in partition
									int32_t iStartSliceIdx
								  );


/*!
 * \brief	validate the frame rate settings of every dependency layer
 * \pParam	pCfg		SWelsSvcCodingParam*; on success pCfg->fMaxFrameRate is raised/lowered to the
 *						largest layer input frame rate when the two differ by more than the epsilon
 * \return	successful - 0; otherwise none 0 for failed
 */
int32_t ParamValidation( SWelsSvcCodingParam *pCfg )
{
	const float fEpsn	= 0.000001f;	// tolerance used for "equals zero" / "differs" float tests
	float fMaxFrameRate	= 0.0f;
	int32_t i			= 0;

	assert( pCfg != NULL );
	if ( NULL == pCfg )	// keep release builds (assert compiled out) from dereferencing NULL
		return 1;

	for (i = 0; i < pCfg->iNumDependencyLayer; ++ i)
	{
		SDLayerParam *fDlp = &pCfg->sDependencyLayers[i];

		// output rate must not exceed input rate, and neither rate may be (approximately) zero
		if ( fDlp->fOutputFrameRate > fDlp->fInputFrameRate || (fDlp->fInputFrameRate >= -fEpsn && fDlp->fInputFrameRate <= fEpsn)
			|| (fDlp->fOutputFrameRate >= -fEpsn && fDlp->fOutputFrameRate <= fEpsn) )
		{
#if defined (_DEBUG)
			fprintf(stderr, "Invalid settings in input frame rate(%.6f) or output frame rate(%.6f) of layer #%d config file..\n",
				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
#endif
			return 1;
		}
		// GetLogFactor() reports UINT_MAX when the in/out ratio is rejected (see message below)
		if ( UINT_MAX == GetLogFactor(fDlp->fOutputFrameRate, fDlp->fInputFrameRate) )
		{
#if defined (_DEBUG)
			fprintf(stderr, "Invalid settings in input frame rate(%.6f) and output frame rate(%.6f) of layer #%d config file: iResult of output frame rate divided by input frame rate should be power of 2(i.e,in/pOut=2^n)..\n",
				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
#endif
			return 1;
		}

		// track the largest input frame rate; pCfg itself is only touched once all layers passed
		if ( fDlp->fInputFrameRate > fMaxFrameRate )
			fMaxFrameRate	= fDlp->fInputFrameRate;
	}

	// synchronize the global maximum with the layers' maximum input frame rate when they disagree
	if ( fMaxFrameRate > fEpsn && (fMaxFrameRate - pCfg->fMaxFrameRate > fEpsn || fMaxFrameRate - pCfg->fMaxFrameRate < -fEpsn) )
	{
		pCfg->fMaxFrameRate	= fMaxFrameRate;
	}

	return 0;
}

/*!
 * \brief	validate (and where needed normalize) the extended coding parameters
 * \pParam	pParam		SWelsSvcCodingParam* passed as void*
 * \return	0 - successful; otherwise none 0 for failed
 *
 * Checks the layer counts, GOP size and intra period, derives bDeblockingParallelFlag, then for
 * each dependency layer checks the picture size and the slice mode settings (some settings are
 * rewritten, e.g. falling back to SM_SINGLE_SLICE). Finally defers to ParamValidation().
 */
int32_t ParamValidationExt( void *pParam )
{
	SWelsSvcCodingParam *pCodingParam = (SWelsSvcCodingParam *)pParam;
	int32_t i = 0;
	int32_t iIdx = 0;

	assert ( pCodingParam != NULL );
	if ( NULL == pCodingParam )
		return 1;

	if ( pCodingParam->iNumDependencyLayer < 1 || pCodingParam->iNumDependencyLayer > MAX_DEPENDENCY_LAYER ){
#if defined (_DEBUG)
		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumDependencyLayer: %d!\n", pCodingParam->iNumDependencyLayer);
#endif//#if _DEBUG

		return 1;
	}

	if ( pCodingParam->iNumTemporalLayer < 1 || pCodingParam->iNumTemporalLayer > MAX_TEMPORAL_LEVEL ){
#if defined (_DEBUG)
		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumTemporalLayer: %d!\n", pCodingParam->iNumTemporalLayer);
#endif//#if _DEBUG
		return 1;
	}

	if ( pCodingParam->uiGopSize < 1 || pCodingParam->uiGopSize > MAX_GOP_SIZE ){
#if defined (_DEBUG)
		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->uiGopSize: %d!\n", pCodingParam->uiGopSize);
#endif//#if _DEBUG
		return 1;
	}

	// a zero uiIntraPeriod skips both intra period checks below
	if ( pCodingParam->uiIntraPeriod && pCodingParam->uiIntraPeriod < pCodingParam->uiGopSize )
	{
#if defined (_DEBUG)
		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be not less than that of uiGopSize(%d) or -1 specified!\n",
			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
#endif//#if _DEBUG
		return 1;
	}

	// NOTE(review): the mask test equals "multiple of uiGopSize" only when uiGopSize is a power of 2;
	// that property is not verified in this function - confirm it is guaranteed elsewhere.
	if ( pCodingParam->uiIntraPeriod && (pCodingParam->uiIntraPeriod & (pCodingParam->uiGopSize-1)) != 0 )
	{
#if defined (_DEBUG)
		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be multiple of uiGopSize(%d) or -1 specified!\n",
			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
#endif//#if _DEBUG
		return 1;
	}


#ifdef MT_ENABLED
	//about iMultipleThreadIdc, bDeblockingParallelFlag, iLoopFilterDisableIdc, & uiSliceMode
	// (1) Single Thread
	//	if (THREAD==1)//single thread
	//		no parallel_deblocking: bDeblockingParallelFlag = 0;
	// (2) Multi Thread: see uiSliceMode decision
	if ( pCodingParam->iMultipleThreadIdc == 1 )
	{
		//now is single thread. no parallel deblocking, set flag=0
		pCodingParam->bDeblockingParallelFlag = false;
	}
	else
	{
		pCodingParam->bDeblockingParallelFlag = true;
	}
#else
	pCodingParam->bDeblockingParallelFlag	= false;
#endif//MT_ENABLED

	for ( i = 0; i < pCodingParam->iNumDependencyLayer; ++ i ){
		SDLayerParam *fDlp = &pCodingParam->sDependencyLayers[i];
		const int32_t kiPicWidth = fDlp->iFrameWidth;
		const int32_t kiPicHeight= fDlp->iFrameHeight;
		const int32_t kiMaxSliceNum	= MAX_SLICES_NUM;
		int32_t iMbWidth		= 0;
		int32_t iMbHeight		= 0;
		int32_t iMbNumInFrame		= 0;
		if ( kiPicWidth <= 0 || kiPicHeight <= 0 ){
#if defined (_DEBUG)
			fprintf(stderr, "ParamValidationExt(), invalid %d x %d in dependency layer settings!\n", kiPicWidth, kiPicHeight);
#endif//#if _DEBUG
			return 1;
		}
		if ( (kiPicWidth & 0x0F) != 0 || (kiPicHeight & 0x0F) != 0 ){
#if defined (_DEBUG)
			fprintf(stderr, "ParamValidationExt(), in layer #%d iWidth x iHeight(%d x %d) both should be multiple of 16, can not support with arbitrary size currently!\n", i, kiPicWidth, kiPicHeight);
#endif//#if _DEBUG
			return 1;
		}

		if ( fDlp->sMso.uiSliceMode >= SM_RESERVED ){
#if defined (_DEBUG)
			fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode );
#endif//#if _DEBUG
			return 1;
		}

		//check pSlice settings under multi-pSlice
		if ( kiPicWidth<=16 && kiPicHeight<=16 ){
			//only have one MB, set to single_slice
			fDlp->sMso.uiSliceMode = SM_SINGLE_SLICE;
		}
		switch ( fDlp->sMso.uiSliceMode )
		{
			case SM_SINGLE_SLICE:
				// exactly one slice; previously the count was set to 1 and then overwritten with 0,
				// unlike every other fallback to SM_SINGLE_SLICE below which leaves it at 1
				fDlp->sMso.sSliceArgument.iSliceNum = 1;
				fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
				for (iIdx=0; iIdx<MAX_SLICES_NUM;iIdx++)
				{
					fDlp->sMso.sSliceArgument.uiSliceMbNum[iIdx] = 0;
				}
				break;
			case SM_FIXEDSLCNUM_SLICE:
				{
					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;

					iMbWidth	= (kiPicWidth+15)>>4;
					iMbHeight	= (kiPicHeight+15)>>4;
					iMbNumInFrame = iMbWidth * iMbHeight;
					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0
						|| fDlp->sMso.sSliceArgument.iSliceNum > kiMaxSliceNum )
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
#endif//#if _DEBUG
						return 1;
					}
					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), uiSliceNum(%d) you set for SM_FIXEDSLCNUM_SLICE, now turn to SM_SINGLE_SLICE type!\n", fDlp->sMso.sSliceArgument.iSliceNum );
#endif//#if _DEBUG
						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
						break;
					}
					if (pCodingParam->bEnableRc)	// multiple slices verify with gom
					{
						//check uiSliceNum
						GomValidCheckSliceNum( iMbWidth, iMbHeight, (int32_t*)&fDlp->sMso.sSliceArgument.iSliceNum );
						assert(fDlp->sMso.sSliceArgument.iSliceNum > 1);
						//set uiSliceMbNum with current uiSliceNum
						GomValidCheckSliceMbNum( iMbWidth, iMbHeight, &fDlp->sMso.sSliceArgument );
					}
					else if ( !CheckFixedSliceNumMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
					{//check uiSliceMbNum with current uiSliceNum
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
#endif//#if _DEBUG
						return 1;
					}
					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
					{
						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
					}
				}
				break;
			case SM_RASTER_SLICE:
				{
					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;

					iMbWidth	= (kiPicWidth+15)>>4;
					iMbHeight	= (kiPicHeight+15)>>4;
					iMbNumInFrame = iMbWidth * iMbHeight;
					if ( fDlp->sMso.sSliceArgument.uiSliceMbNum[0] <= 0 )
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
#endif//#if _DEBUG
						return 1;
					}

					if ( !CheckRasterMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
#endif//#if _DEBUG
						return 1;
					}
					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 || fDlp->sMso.sSliceArgument.iSliceNum > kiMaxSliceNum )	// verify interleave mode settings
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) in SM_RASTER_SLICE settings!\n",  fDlp->sMso.sSliceArgument.iSliceNum );
#endif//#if _DEBUG
						return 1;
					}
					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), pSlice setting for SM_RASTER_SLICE now turn to SM_SINGLE_SLICE!\n" );
#endif//#if _DEBUG
						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
						break;
					}
#ifdef MT_ENABLED
					if (pCodingParam->bEnableRc && fDlp->sMso.sSliceArgument.iSliceNum > 1)
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), WARNING: GOM based RC do not support SM_RASTER_SLICE!\n" );
#endif//#if _DEBUG
					}
#endif
					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
					{
						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
					}
				}
				break;
			case SM_ROWMB_SLICE:
				{
					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;

					iMbWidth	= (kiPicWidth+15)>>4;
					iMbHeight	= (kiPicHeight+15)>>4;
					if ( iMbHeight > kiMaxSliceNum )
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings more than MAX!\n", iMbHeight );
#endif//#if _DEBUG
						return 1;
					}
					// one slice per macroblock row
					fDlp->sMso.sSliceArgument.iSliceNum	= iMbHeight;

					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 )
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
#endif//#if _DEBUG
						return 1;
					}
					if ( !CheckRowMbMultiSliceSetting( iMbWidth, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
#endif//#if _DEBUG
						return 1;
					}
				}
				break;
			case SM_DYN_SLICE:
				{
					iMbWidth	= (kiPicWidth+15)>>4;
					iMbHeight	= (kiPicHeight+15)>>4;
					if ( fDlp->sMso.sSliceArgument.uiSliceSizeConstraint <= 0 )
					{
#if defined (_DEBUG)
						fprintf(stderr, "ParamValidationExt(), invalid iSliceSize (%d) settings!\n",   fDlp->sMso.sSliceArgument.uiSliceSizeConstraint );
#endif//#if _DEBUG
						return 1;
					}
					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
					if ( iMbWidth * iMbHeight <= MIN_NUM_MB_PER_SLICE )
					{
						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
					}
				}
				break;
			default:
				{
					// report the mode of the layer being checked (was always layer 0)
#if defined (_DEBUG)
					fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode );
#endif//#if _DEBUG
					return 1;
				}
		}
	}

	return ParamValidation(pCodingParam);
}

/*!
 * \brief	acquire count number of layers and NALs based on configurable parameters dependency
 * \pParam	ppCtx			sWelsEncCtx**, must reference a valid context (used for logging here)
 * \pParam	pParam			SWelsSvcCodingParam*
 * \pParam	pCountLayers	pointer of count number of layers indeed (skipped when NULL)
 * \pParam	pCountNals		pointer of count number of nals indeed (skipped when NULL)
 * \return	0 - successful; otherwise failed
 */
static inline int32_t AcquireLayersNals( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam, int32_t *pCountLayers, int32_t *pCountNals )
{
	int32_t iCountNumLayers		= 0;
	int32_t iCountNumNals			= 0;
	int32_t iNumDependencyLayers	= 0;
	int32_t iDIndex 				= 0;
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
	int32_t iNumLayersPack = 0;
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER

	if ( NULL == pParam || NULL == ppCtx || NULL == *ppCtx )
		return 1;

	iNumDependencyLayers	= pParam->iNumDependencyLayer;

	// do/while: the first dependency layer is always counted
	do {
		SDLayerParam *pDLayer = &pParam->sDependencyLayers[iDIndex];
		int32_t iOrgNumNals = iCountNumNals;	// NAL count before this layer, to measure the per-layer share

		//Note: Sep. 2010
		//Review this part and suggest no change, since the memory over-use
		//(1) counts little to the overall performance
		//(2) should not be critial even under mobile case
		if ( SM_DYN_SLICE == pDLayer->sMso.uiSliceMode )
		{
			// slice count is unknown up front in dynamic mode: reserve the maximum
			iCountNumNals += MAX_SLICES_NUM;
			// plus prefix NALs
			if ( iDIndex == 0 )
				iCountNumNals += MAX_SLICES_NUM;
			// MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME ensured at svc_enc_slice_segment.h
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
			assert(MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
			// iNumLayersPack += MAX_SLICES_NUM; // do not count it for dynamic slicing mode
#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
		}
		else /*if ( SM_SINGLE_SLICE != pDLayer->sMso.uiSliceMode )*/
		{
			const int32_t kiNumOfSlice = GetInitialSliceNum(	(pDLayer->iFrameWidth+0x0f)>>4,
												(pDLayer->iFrameHeight+0x0f)>>4,
												&pDLayer->sMso );

			// NEED check iCountNals value in case multiple slices is used
			iCountNumNals += kiNumOfSlice; // for pSlice VCL NALs
			// plus prefix NALs
			if ( iDIndex == 0 )
				iCountNumNals += kiNumOfSlice;
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
			// use the local slice count (the former name num_of_slice is not declared in this function)
			assert(kiNumOfSlice <= MAX_SLICES_NUM && MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
			iNumLayersPack += kiNumOfSlice;
#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );
#endif//MT_ENALBED && PACKING_ONE_SLICE_PER_LAYER
			if ( kiNumOfSlice > MAX_SLICES_NUM )
			{
				WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_slice(%d) > MAX_SLICES_NUM(%d) per (iDid= %d, qid= %d) settings!\n",
					kiNumOfSlice, MAX_SLICES_NUM, iDIndex, 0 );
				return 1;
			}
		}
#if !defined(MT_ENABLED) || !defined(PACKING_ONE_SLICE_PER_LAYER)
		if ( iCountNumNals - iOrgNumNals > MAX_NAL_UNITS_IN_LAYER )
		{
			WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_nals(%d) > MAX_NAL_UNITS_IN_LAYER(%d) per (iDid= %d, qid= %d) settings!\n",
				(iCountNumNals - iOrgNumNals), MAX_NAL_UNITS_IN_LAYER, iDIndex, 0 );
			return 1;
		}
#endif//!MT_ENABLED) || !PACKING_ONE_SLICE_PER_LAYER

		iCountNumLayers ++;

		++ iDIndex;
	} while(iDIndex < iNumDependencyLayers);

	iCountNumNals += 1 + iNumDependencyLayers + (iCountNumLayers<<1) + iCountNumLayers;	// plus iCountNumLayers for reserved application
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
	iNumLayersPack += 1 + iNumDependencyLayers + (iCountNumLayers<<1);
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER

	// to check number of layers / nals / slices dependencies, 12/8/2010
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
	if ( iNumLayersPack > MAX_LAYER_NUM_OF_FRAME )
	{
		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_layers_pack_overall(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iNumLayersPack, MAX_LAYER_NUM_OF_FRAME );
		return 1;
	}
#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
	if ( iCountNumLayers > MAX_LAYER_NUM_OF_FRAME )
	{
		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iCountNumLayers, MAX_LAYER_NUM_OF_FRAME );
		return 1;
	}
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER

	if ( NULL != pCountLayers )
		*pCountLayers	= iCountNumLayers;
	if ( NULL != pCountNals )
		*pCountNals 	= iCountNumNals;
	return 0;
}

/*!
 * \brief	alloc spatial layers pictures (I420 based source pictures)
 * \pParam	ppCtx		sWelsEncCtx**; on allocation failure FreeMemorySvc(ppCtx) is invoked and
 *						*ppCtx is set to NULL
 * \pParam	pParam		SWelsSvcCodingParam* providing layer count, frame sizes, iHighestTemporalId
 *						and iLTRRefNum
 * \return	0 - successful; 1 - a picture could not be allocated
 */
int32_t AllocSpatialPictures( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam )
{
	CMemoryAlign *pMa			= (*ppCtx)->pMemAlign;
	const int32_t kiDlayerCount	= pParam->iNumDependencyLayer;
	int32_t iDlayerIndex		= 0;

	// spatial pictures
	do {
		const int32_t kiPicWidth = pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
		const int32_t kiPicHeight   = pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
		const uint8_t kuiLayerInTemporal = 2 + WELS_MAX(pParam->sDependencyLayers[iDlayerIndex].iHighestTemporalId, 1);
		const uint8_t kuiRefNumInTemporal = kuiLayerInTemporal + pParam->iLTRRefNum;
		uint8_t i = 0;

		do {
			SPicture *pPic = AllocPicture( pMa, kiPicWidth, kiPicHeight, false );
			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pPic), FreeMemorySvc(ppCtx); *ppCtx = NULL )
			(*ppCtx)->pSpatialPic[iDlayerIndex][i] = pPic;
			++ i;
			// Keep the per-layer picture count in step with what is actually stored, so that a
			// failure part-way through this layer still leaves the count covering the pictures
			// allocated so far (FreeSpatialPictures() walks exactly uiSpatialPicNum[] entries).
			// NOTE(review): presumably FreeMemorySvc() releases them via FreeSpatialPictures() - confirm.
			(*ppCtx)->uiSpatialPicNum[iDlayerIndex] = i;
		} while( i < kuiRefNumInTemporal);

		(*ppCtx)->uiSpatialLayersInTemporal[iDlayerIndex] = kuiLayerInTemporal;
		(*ppCtx)->uiSpatialPicNum[iDlayerIndex] = kuiRefNumInTemporal;
		++ iDlayerIndex;
	} while( iDlayerIndex < kiDlayerCount );

	return 0;
}

/*!
 * \brief	release the spatial pictures of every dependency layer
 * \pParam	pCtx		sWelsEncCtx* owning pSpatialPic[][], uiSpatialPicNum[] and uiSpatialLayersInTemporal[]
 *
 * For each layer the first uiSpatialPicNum[layer] entries are handed to FreePicture() when
 * non-NULL, then uiSpatialLayersInTemporal[layer] is cleared.
 */
void FreeSpatialPictures( sWelsEncCtx *pCtx )
{
	CMemoryAlign *pMa	= pCtx->pMemAlign;

	for ( int32_t iLayer = 0; iLayer < pCtx->pSvcParam->iNumDependencyLayer; ++ iLayer )
	{
		const uint8_t kuiPicCount = pCtx->uiSpatialPicNum[iLayer];

		for ( uint8_t uiPicIdx = 0; uiPicIdx < kuiPicCount; ++ uiPicIdx )
		{
			if ( NULL == pCtx->pSpatialPic[iLayer][uiPicIdx] )
				continue;	// nothing stored in this slot
			FreePicture( pMa, &pCtx->pSpatialPic[iLayer][uiPicIdx] );
		}

		pCtx->uiSpatialLayersInTemporal[iLayer]	= 0;
	}
}

/*!
 * \brief	initialize the macroblock list of one dependency layer
 * \pParam	pEnc		sWelsEncCtx* providing the stride tables and the shared per-MB buffers
 * \pParam	pList		SMB array of the layer, at least iMbWidth*iMbHeight entries
 * \pParam	pLayer		SDqLayer* giving MB dimensions and the slice context
 * \pParam	kiDlayerId	dependency layer id
 * \pParam	kiMaxMbNum	MB count used as offset unit into the mv / ref index buffers
 *
 * For every MB: sets position and index, slice idc, neighbor availability (a neighbor counts
 * only when it lies inside the picture and in the same slice) and the pointers into the
 * shared buffers.
 */
static  void  InitMbInfo(sWelsEncCtx * pEnc, SMB  * pList, SDqLayer * pLayer, const int32_t kiDlayerId, const int32_t kiMaxMbNum )
{
	const int32_t kiMbWidth	= pLayer->iMbWidth;
	const int32_t kiMbNum		= kiMbWidth * pLayer->iMbHeight;
	SSliceCtx *pSliceCtx		= pLayer->pSliceEncCtx;
	// layers with an odd id start kiMaxMbNum MBs into the mv / ref index buffers, even ids at 0
	const int32_t kiOffset	= (kiDlayerId & 0x01) * kiMaxMbNum;
	SMVUnitXY (*pLayerMvUnitBlock4x4)[MB_BLOCK4x4_NUM]	= (SMVUnitXY(*)[MB_BLOCK4x4_NUM])(&pEnc->pMvUnitBlock4x4[MB_BLOCK4x4_NUM*kiOffset]);
	int8_t (*pLayerRefIndexBlock8x8)[MB_BLOCK8x8_NUM]		= (int8_t(*)[MB_BLOCK8x8_NUM])(&pEnc->pRefIndexBlock4x4[MB_BLOCK8x8_NUM*kiOffset]);
	int32_t iIdx;

	for( iIdx = 0; iIdx < kiMbNum; iIdx++ ){
		SMB *pMb = &pList[iIdx];
		BOOL_T     bLeft;
		BOOL_T     bTop;
		BOOL_T     bLeftTop;
		BOOL_T     bRightTop;
		int32_t  iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
		uint8_t  uiSliceIdc;
		uint32_t uiNeighborAvail = 0;

		pMb->iMbX = pEnc->pStrideTab->pMbIndexX[kiDlayerId][iIdx];
		pMb->iMbY = pEnc->pStrideTab->pMbIndexY[kiDlayerId][iIdx];
		pMb->iMbXY = iIdx;

		uiSliceIdc = WelsMbToSliceIdc(pSliceCtx, iIdx);
		iLeftXY = iIdx - 1;
		iTopXY = iIdx - kiMbWidth;
		iLeftTopXY = iTopXY - 1;
		iRightTopXY = iTopXY + 1;

		// the position test comes first so the slice lookup is never done for an index outside the picture
		bLeft = (pMb->iMbX > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftXY));
		bTop = (pMb->iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iTopXY));
		bLeftTop = (pMb->iMbX > 0) && (pMb->iMbY > 0) && (uiSliceIdc ==
			WelsMbToSliceIdc(pSliceCtx, iLeftTopXY));
		bRightTop = (pMb->iMbX < (kiMbWidth-1)) && (pMb->iMbY > 0) && (uiSliceIdc ==
			WelsMbToSliceIdc(pSliceCtx, iRightTopXY));

		if( bLeft ){
			uiNeighborAvail |= LEFT_MB_POS;
		}
		if( bTop ){
			uiNeighborAvail |= TOP_MB_POS;
		}
		if( bLeftTop ){
			uiNeighborAvail |= TOPLEFT_MB_POS;
		}
		if( bRightTop ){
			uiNeighborAvail |= TOPRIGHT_MB_POS;
		}
		pMb->uiSliceIdc		= uiSliceIdc;	// merge from svc_hd_opt_b for multiple slices coding
		pMb->uiNeighborAvail	= uiNeighborAvail;
		// (a second, BASE_MV_MB_NMB based availability mask used to be computed here but was
		// never stored or read, so it has been dropped)

		pMb->sMv					= pLayerMvUnitBlock4x4[iIdx];
		pMb->pRefIndex			= pLayerRefIndexBlock8x8[iIdx];
		pMb->pSadCost			= &pEnc->pSadCostMb[iIdx];
		pMb->pIntra4x4PredMode	= &pEnc->pIntra4x4PredModeBlocks[iIdx*INTRA_4x4_MODE_NUM];
		pMb->pNonZeroCount		= &pEnc->pNonZeroCountBlocks[iIdx*MB_LUMA_CHROMA_BLOCK4x4_NUM];
	}
}


/*!
 * \brief	allocate the macroblock lists of all dependency layers and initialize them
 * \pParam	ppCtx		sWelsEncCtx**; ppMbListD and ppDqLayerList[i]->sMbDataP are set up,
 *						FreeMemorySvc(ppCtx) is invoked when an allocation fails
 * \return	0 - successful; 1 - invalid layer count or allocation failure
 *
 * All layers share one contiguous SMB array: ppMbListD[0] is the allocation, ppMbListD[i]
 * points right behind the MBs of layer i-1.
 */
int32_t   InitMbListD( sWelsEncCtx ** ppCtx)
{
	const int32_t	kiNumDlayer = (*ppCtx)->pSvcParam->iNumDependencyLayer;
	int32_t		iMbSize[MAX_DEPENDENCY_LAYER] = { 0 };
	int32_t		iOverallMbNum = 0;
	int32_t		i;

	// also reject < 1: iMbSize[kiNumDlayer-1] and ppMbListD[0] are used below
	if ( kiNumDlayer < 1 || kiNumDlayer > MAX_DEPENDENCY_LAYER )
		return 1;

	for( i=0;i<kiNumDlayer;i++ ){
		const int32_t kiMbWidth	= ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameWidth + 15)>>4;
		const int32_t kiMbHeight	= ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameHeight + 15)>>4;
		iMbSize[i] = kiMbWidth  * kiMbHeight;
		iOverallMbNum += iMbSize[i];
	}

	(*ppCtx)->ppMbListD = static_cast<SMB **>((*ppCtx)->pMemAlign->WelsMalloc(kiNumDlayer * sizeof(SMB *), "ppMbListD"));
	// verify the pointer table before touching any of its entries
	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD==NULL, FreeMemorySvc(ppCtx));
	(*ppCtx)->ppMbListD[0] = static_cast<SMB*>((*ppCtx)->pMemAlign->WelsMallocz(iOverallMbNum * sizeof(SMB), "ppMbListD[0]"));
	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD[0]==NULL, FreeMemorySvc(ppCtx));
	(*ppCtx)->ppDqLayerList[0]->sMbDataP = (*ppCtx)->ppMbListD[0];
	// the MB count of the last layer is passed as kiMaxMbNum for every layer
	InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[0], (*ppCtx)->ppDqLayerList[0], 0, iMbSize[kiNumDlayer-1]);
	for( i=1;i<kiNumDlayer;i++ ){
		(*ppCtx)->ppMbListD[i] = (*ppCtx)->ppMbListD[i-1] + iMbSize[i-1];
		(*ppCtx)->ppDqLayerList[i]->sMbDataP = (*ppCtx)->ppMbListD[i];
		InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[i], (*ppCtx)->ppDqLayerList[i], i, iMbSize[kiNumDlayer-1]);
	}

	return 0;
}

/*!
 * \brief	allocate the working buffers of one macroblock cache
 * \pParam	pMbCache	SMbCache* whose buffer pointers are filled in
 * \pParam	pMa			CMemoryAlign* allocator used for every buffer
 * \return	0 - successful; 1 as soon as one allocation fails (buffers obtained before the
 *			failing one are left assigned in pMbCache)
 */
int32_t AllocMbCacheAligned( SMbCache *pMbCache, CMemoryAlign *pMa )
{
	// coefficient levels
	pMbCache->pCoeffLevel = static_cast<int16_t *>(
		pMa->WelsMalloc( MB_COEFF_LIST_SIZE*sizeof(int16_t), "pMbCache->pCoeffLevel" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pCoeffLevel));

	// prediction / skip buffers
	pMbCache->pMemPredMb = static_cast<uint8_t *>(
		pMa->WelsMalloc( 2*256*sizeof(uint8_t), "pMbCache->pMemPredMb" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pMemPredMb));

	pMbCache->pSkipMb = static_cast<uint8_t *>(
		pMa->WelsMalloc( 384*sizeof(uint8_t), "pMbCache->pSkipMb" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pSkipMb));

	pMbCache->pMemPredBlk4 = static_cast<uint8_t *>(
		pMa->WelsMalloc( 2*16*sizeof(uint8_t), "pMbCache->pMemPredBlk4" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pMemPredBlk4));

	pMbCache->pBufferInterPredMe = static_cast<uint8_t *>(
		pMa->WelsMalloc( 4*640*sizeof(uint8_t), "pMbCache->pBufferInterPredMe" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pBufferInterPredMe));

	// intra 4x4 mode signalling, 16 entries each
	pMbCache->pPrevIntra4x4PredModeFlag = static_cast<bool_t *>(
		pMa->WelsMalloc( 16*sizeof(bool_t), "pMbCache->pPrevIntra4x4PredModeFlag" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pPrevIntra4x4PredModeFlag));

	pMbCache->pRemIntra4x4PredModeFlag = static_cast<int8_t *>(
		pMa->WelsMalloc( 16*sizeof(int8_t), "pMbCache->pRemIntra4x4PredModeFlag" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pRemIntra4x4PredModeFlag));

	// single SDCTCoeff instance
	pMbCache->pDct = static_cast<SDCTCoeff *>(
		pMa->WelsMalloc( sizeof(SDCTCoeff), "pMbCache->pDct" ) );
	WELS_VERIFY_RETURN_IF(1, (NULL==pMbCache->pDct));

	return 0;
}

/*!
 * \brief	release every buffer of a macroblock cache that is currently allocated
 * \pParam	pMbCache	SMbCache* whose non-NULL buffers are freed and reset to NULL
 * \pParam	pMa			CMemoryAlign* used to free the buffers
 */
void FreeMbCache( SMbCache *pMbCache, CMemoryAlign *pMa )
{
// Frees pMbCache->member when set and clears the pointer. The tag handed to WelsFree() is the
// literal "pMbCache->" followed by the member name.
#define WELS_FREE_MB_CACHE_MEMBER( member )									\
	do {																	\
		if ( NULL != pMbCache->member )										\
		{																	\
			pMa->WelsFree( pMbCache->member, "pMbCache->" #member );		\
			pMbCache->member = NULL;										\
		}																	\
	} while ( 0 )

	WELS_FREE_MB_CACHE_MEMBER( pCoeffLevel );
	WELS_FREE_MB_CACHE_MEMBER( pMemPredMb );
	WELS_FREE_MB_CACHE_MEMBER( pSkipMb );
	WELS_FREE_MB_CACHE_MEMBER( pMemPredBlk4 );
	WELS_FREE_MB_CACHE_MEMBER( pBufferInterPredMe );
	WELS_FREE_MB_CACHE_MEMBER( pPrevIntra4x4PredModeFlag );
	WELS_FREE_MB_CACHE_MEMBER( pRemIntra4x4PredModeFlag );
	WELS_FREE_MB_CACHE_MEMBER( pDct );

#undef WELS_FREE_MB_CACHE_MEMBER
}


/*!
 * \brief	initialize ppDqLayerList and slicepEncCtx_list due to count number of layers available
 * \pParam	ppCtx			sWelsEncCtx**
 * \return	0 - successful; otherwise failed
 */
static inline int32_t InitDqLayers( sWelsEncCtx **ppCtx )
{
	SWelsSvcCodingParam *pParam	= NULL;
	SWelsSPS *pSps						= NULL;
	SSubsetSps *pSubsetSps			= NULL;
	SWelsPPS *pPps						= NULL;
	CMemoryAlign *pMa				= NULL;
	SStrideTables *pStrideTab		= NULL;	
	int32_t iDlayerCount					= 0;	
	int32_t iDlayerIndex					= 0;
	uint32_t iSpsId					= 0;
	uint32_t iPpsId					= 0;
	uint32_t iNumRef				= 0;
	int32_t iResult					= 0;	
	
	if ( NULL == ppCtx || NULL == *ppCtx )
		return 1;

	pMa		= (*ppCtx)->pMemAlign;
	pParam	= (*ppCtx)->pSvcParam;	
	iDlayerCount	= pParam->iNumDependencyLayer;
	iNumRef	= pParam->iNumRefFrame;
//	highest_layers_in_temporal = 1 + WELS_MAX(pParam->iDecompStages, 1);
	pStrideTab	= (*ppCtx)->pStrideTab;	

	iDlayerIndex			= 0;	
	while (iDlayerIndex < iDlayerCount)
	{
		SRefList *pRefList			= NULL;
		uint32_t i					= 0;
		const int32_t kiWidth			= pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
		const int32_t kiHeight		= pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
        int32_t iPicWidth			= WELS_ALIGN(kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH<<1);	// with iWidth of horizon
        int32_t iPicChromaWidth	= iPicWidth >> 1;

        iPicWidth	= WELS_ALIGN( iPicWidth, 32 );	// 32(or 16 for chroma below) to match original imp. here instead of iCacheLineSize
		iPicChromaWidth	= WELS_ALIGN( iPicChromaWidth, 16 );

		WelsGetEncBlockStrideOffset( (*ppCtx)->pStrideTab->pStrideEncBlockOffset[iDlayerIndex], iPicWidth, iPicChromaWidth);		
		
		// pRef list
		pRefList		= (SRefList *)pMa->WelsMallocz( sizeof(SRefList), "pRefList" );
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList), FreeMemorySvc(ppCtx) )			
		
		do {
			pRefList->pRef[i]	= AllocPicture( pMa, kiWidth, kiHeight, true );	// to use actual size of current layer
			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList->pRef[i]), FreeMemorySvc(ppCtx) )				
			++ i;
		} while(i < 1 + iNumRef);
		
		pRefList->pNextBuffer = pRefList->pRef[0];			
		(*ppCtx)->ppRefPicListExt[iDlayerIndex]	= pRefList;		
		++ iDlayerIndex;
	}

	// for I420 based source spatial pictures
	if ( AllocSpatialPictures( ppCtx, pParam ) )
	{
		FreeMemorySvc( ppCtx );		
		return 1;
	}

	iDlayerIndex	= 0;
	while (iDlayerIndex < iDlayerCount) {
		SDqLayer *pDqLayer		= NULL;
		SDLayerParam *pDlayer	= &pParam->sDependencyLayers[iDlayerIndex];		
		const int32_t kiMbW		= (pDlayer->iFrameWidth + 0x0f) >> 4;
		const int32_t kiMbH		= (pDlayer->iFrameHeight + 0x0f) >> 4;
		int32_t iMaxSliceNum	= 1;
		const int32_t kiSliceNum = GetInitialSliceNum( kiMbW, kiMbH, &pDlayer->sMso );
		if ( iMaxSliceNum < kiSliceNum )
			iMaxSliceNum = kiSliceNum;

		// pDq layers list
		pDqLayer = (SDqLayer *)pMa->WelsMallocz( sizeof(SDqLayer), "pDqLayer" );
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer), FreeMemorySvc(ppCtx) )

		// for dynamic slicing mode
		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )
		{			
			const int32_t iSize			= pParam->iCountThreadsNum * sizeof(int32_t);

			pDqLayer->pNumSliceCodedOfPartition		= (int32_t *)pMa->WelsMallocz( iSize, "pNumSliceCodedOfPartition" );			
			pDqLayer->pLastCodedMbIdxOfPartition	= (int32_t *)pMa->WelsMallocz( iSize, "pLastCodedMbIdxOfPartition" );
			pDqLayer->pLastMbIdxOfPartition			= (int32_t *)pMa->WelsMallocz( iSize, "pLastMbIdxOfPartition" );

			WELS_VERIFY_RETURN_PROC_IF( 1,
										(NULL == pDqLayer->pNumSliceCodedOfPartition ||
										NULL == pDqLayer->pLastCodedMbIdxOfPartition ||
										NULL == pDqLayer->pLastMbIdxOfPartition),
										FreeMemorySvc(ppCtx) )
		}

		pDqLayer->iMbWidth					= kiMbW;
		pDqLayer->iMbHeight					= kiMbH;
#ifndef MT_ENABLED
		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )//wmalloc pSliceInLayer
		{	
			SSlice *pSlice			= NULL;
			int32_t iSliceIdx		= 0;
			//wmalloc AVERSLICENUM_CONSTANT of pDqLayer->sLayerInfo.pSliceInLayer, 
			//wmalloc AVERSLICENUM_CONSTANT num of pSlice as initialization			
			//only set value for the first pSlice
			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );

			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
			{
				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
				pSlice->uiSliceIdx = 0;
				pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
			}		

			while(iSliceIdx < iMaxSliceNum)
			{
				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
				{
					FreeMemorySvc(ppCtx);
					return 1;
				}
				++ iSliceIdx;
			}
		}
		else
#endif//!MT_ENABLED
		{			
			int32_t iSliceIdx		= 0;
			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );

			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
			if ( iMaxSliceNum > 1 )
			{
				while (iSliceIdx < iMaxSliceNum) {
					SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
					pSlice->uiSliceIdx = iSliceIdx;
#ifdef MT_ENABLED
					if ( pParam->iMultipleThreadIdc > 1 )
						pSlice->pSliceBsa = &(*ppCtx)->pSliceBs[iSliceIdx].sBsWrite;
					else
						pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
#else
					pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
#endif//MT_ENABLED
					if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
					{
						FreeMemorySvc(ppCtx);
						return 1;
					}
					++ iSliceIdx;
				}
			}
			// fix issue in case single pSlice coding might be inclusive exist in variant spatial layer setting, also introducing multi-pSlice modes
			else	// only one pSlice
			{
				SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
				pSlice->uiSliceIdx	= 0;
				pSlice->pSliceBsa	= &(*ppCtx)->pOut->sBsWrite;
				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
				{
					FreeMemorySvc(ppCtx);
					return 1;
				}
			}
		}

		//deblocking parameters initialization
		//target-layer deblocking
		pDqLayer->iLoopFilterDisableIdc	                = pParam->iLoopFilterDisableIdc;
		pDqLayer->iLoopFilterAlphaC0Offset				= (pParam->iLoopFilterAlphaC0Offset)<<1;
		pDqLayer->iLoopFilterBetaOffset					= (pParam->iLoopFilterBetaOffset)<<1;
		//inter-layer deblocking
		pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pParam->iInterLayerLoopFilterDisableIdc;
		pDqLayer->iInterLayerSliceAlphaC0Offset				= (pParam->iInterLayerLoopFilterAlphaC0Offset)<<1;
		pDqLayer->iInterLayerSliceBetaOffset				= (pParam->iInterLayerLoopFilterBetaOffset)<<1;
		//parallel deblocking
		pDqLayer->bDeblockingParallelFlag                  = pParam->bDeblockingParallelFlag;

		//deblocking parameter adjustment
		if ( SM_SINGLE_SLICE == pDlayer->sMso.uiSliceMode )
		{
			//iLoopFilterDisableIdc: will be 0 or 1 under single_slice
			if ( 2 == pParam->iLoopFilterDisableIdc )
			{
				pDqLayer->iLoopFilterDisableIdc	= 0;
			}
			//bDeblockingParallelFlag
			pDqLayer->bDeblockingParallelFlag = false;
		}
		else
		{//multi-pSlice
#ifdef MT_ENABLED
			if ( 0 == pDqLayer->iLoopFilterDisableIdc )
			{
				pDqLayer->bDeblockingParallelFlag	= false;
			}
#endif
		}

		(*ppCtx)->ppDqLayerList[iDlayerIndex]	= pDqLayer;
		
		++ iDlayerIndex;
	}

	// for dynamically malloc for parameter sets memory instead of maximal items for standard to reduce size, 3/18/2010
	if ( &(*ppCtx)->pSvcParam->bMgsT0OnlyStrategy )
	{
	    (*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( (1+iDlayerCount) * sizeof(SWelsPPS), "pPPSArray" );
	}
	else
	{
	    (*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( iDlayerCount * sizeof(SWelsPPS), "pPPSArray" );
	}
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pPPSArray), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pSpsArray	= (SWelsSPS *)pMa->WelsMalloc( sizeof(SWelsSPS), "pSpsArray" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSpsArray), FreeMemorySvc(ppCtx) )
	if ( iDlayerCount > 1 )
	{
		(*ppCtx)->pSubsetArray	= (SSubsetSps *)pMa->WelsMalloc( (iDlayerCount-1) * sizeof(SSubsetSps), "pSubsetArray" );
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSubsetArray), FreeMemorySvc(ppCtx) )
	}
	
	(*ppCtx)->pDqIdcMap	= (SDqIdc *)pMa->WelsMallocz( iDlayerCount * sizeof(SDqIdc), "pDqIdcMap" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pDqIdcMap), FreeMemorySvc(ppCtx) )	

	iDlayerIndex	= 0;
	while (iDlayerIndex < iDlayerCount) {		
		SDqIdc *pDqIdc		= &(*ppCtx)->pDqIdcMap[iDlayerIndex];		
		const bool_t bUseSubsetSps			= (iDlayerIndex > BASE_DEPENDENCY_ID);
		SDLayerParam *pDlayerParam	= &pParam->sDependencyLayers[iDlayerIndex];

		pDqIdc->uiSpatialId	= iDlayerIndex;
		pPps	= &(*ppCtx)->pPPSArray[iPpsId];
		if ( !bUseSubsetSps )
		{
			pSps	= &(*ppCtx)->pSpsArray[iSpsId];
		}
		else
		{
			pSubsetSps	= &(*ppCtx)->pSubsetArray[iSpsId];
			pSps			= &pSubsetSps->pSps;
		}		

		// Need port pSps/pPps initialization due to spatial scalability changed
		if ( !bUseSubsetSps )
		{	
			WelsInitSps( pSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId, 
						  pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );

			if( iDlayerCount > 1 )
			{
				pSps->bConstraintSet0Flag = true;
				pSps->bConstraintSet1Flag = true;
				pSps->bConstraintSet2Flag = true;
			}
		}
		else
		{
			WelsInitSubsetSps( pSubsetSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
								 pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );
		}

		// initialize pPps
		WelsInitPps( pPps, pSps, pSubsetSps, iPpsId, true, bUseSubsetSps );

		// Not using FMO in SVC coding so far, come back if need FMO
		{
			iResult = InitSlicePEncCtx(	&(*ppCtx)->pSliceCtxList[iDlayerIndex],
											(*ppCtx)->pMemAlign,
											false,
											pSps->iMbWidth,
											pSps->iMbHeight,
											&(pDlayerParam->sMso),
											pPps	);
			if ( iResult )
			{
				WelsLog( *ppCtx, WELS_LOG_WARNING, "InitDqLayers(), InitSlicePEncCtx failed(%d)!", iResult );
				FreeMemorySvc( ppCtx );			
				return 1;
			}
			(*ppCtx)->ppDqLayerList[iDlayerIndex]->pSliceEncCtx	= &(*ppCtx)->pSliceCtxList[iDlayerIndex];			
		}
		pDqIdc->iSpsId	= iSpsId;
		pDqIdc->iPpsId	= iPpsId;

		(*ppCtx)->sPSOVector.bPpsIdMappingIntoSubsetsps[iPpsId] = bUseSubsetSps;

		if ( bUseSubsetSps )
			++ iSpsId;
		++ iPpsId;
		++ (*ppCtx)->iSpsNum;
		++ (*ppCtx)->iPpsNum;

		++ iDlayerIndex;
	}	
	return 0;
}

/*!
 * \brief	allocate and initialize the stride/offset and MB index lookup tables of all spatial layers
 * \param	ppCtx				encoder context; (*ppCtx)->pStrideTab receives the table set
 * \param	kiNumSpatialLayers	number of spatial layers, has to be within [1, MAX_DEPENDENCY_LAYER]
 * \return	0 - successful; 1 - invalid layer count or allocation failure
 *
 * \note	A single buffer ("pBase") backs every table. Its layout is: decoder block offset
 *			tables (24 int32_t each), encoder block offset tables (24 int32_t each, only
 *			reserved here, not filled in), MB X index tables, MB Y index tables.
 *			pStrideDecBlockOffset[0][1] is the first table carved out of it and therefore
 *			equals the start of that buffer.
 * \note	Nothing is released here on failure; whatever has been allocated so far stays
 *			reachable through (*ppCtx)->pStrideTab.
 */
int32_t AllocStrideTables( sWelsEncCtx **ppCtx, const int32_t kiNumSpatialLayers )
{
	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
	SStrideTables *pPtr				= NULL;
	int16_t *pTmpRow	= NULL, *pRowX = NULL, *pRowY = NULL, *p = NULL;
	uint8_t *pBase		= NULL;
	uint8_t *pBaseDec = NULL, *pBaseEnc = NULL, *pBaseMbX = NULL, *pBaseMbY = NULL;
	struct {
		int32_t iMbWidth;				// width of the layer in MB units
		int32_t iMbHeight;				// height of the layer in MB units
		int32_t iSizeAllMbAlignCache;	// bytes of one int16_t-per-MB table of the layer
	} sMbSizeMap[MAX_DEPENDENCY_LAYER] = {0};
	int32_t iLineSizeY[MAX_DEPENDENCY_LAYER][2] = {0};
	int32_t iLineSizeUV[MAX_DEPENDENCY_LAYER][2]= {0};
	int32_t iMapSpatialIdx[MAX_DEPENDENCY_LAYER][2] = {0};
	int32_t iSizeDec		= 0;
	int32_t iSizeEnc		= 0;
	int32_t iCountLayersNeedCs[2]	= {0};
	const int32_t kiUnit1Size = 24 * sizeof(int32_t);
	int32_t iUnit2Size		= 0;
	int32_t iNeedAllocSize	= 0;
	int32_t iRowSize		= 0;
	int32_t iLargestMbWidth	= 0;
	int32_t iLargestMbHeight	= 0;
	int16_t iMaxMbWidth	= 0;
	int16_t iMaxMbHeight	= 0;
	int32_t i				= 0;
	int32_t iSpatialIdx		= 0;
	int32_t iTemporalIdx	= 0;
	int32_t iCntTid			= 0;

	if ( kiNumSpatialLayers <= 0 || kiNumSpatialLayers > MAX_DEPENDENCY_LAYER)
		return 1;

	// Zero-filled on purpose: FreeMemorySvc() tests pStrideDecBlockOffset[0][1] to decide whether
	// "pBase" has to be released, so that slot must read NULL when a later allocation fails.
	// It also keeps the [x][0] slots defined when only one temporal class is populated below.
	pPtr = (SStrideTables *)pMa->WelsMallocz(sizeof(SStrideTables), "SStrideTables");
	if (NULL == pPtr)
		return 1;
	(*ppCtx)->pStrideTab = pPtr;

	iCntTid	= pParam->iNumTemporalLayer > 1 ? 2 : 1;

	// pass 1: MB geometry of each layer and the accumulated size of one set of MB index tables
	iSpatialIdx = 0;
	while (iSpatialIdx < kiNumSpatialLayers) {
		const int32_t kiTmpWidth = (pParam->sDependencyLayers[iSpatialIdx].iFrameWidth + 15) >> 4;
		const int32_t kiTmpHeight= (pParam->sDependencyLayers[iSpatialIdx].iFrameHeight + 15) >> 4;
		int32_t iNumMb = kiTmpWidth * kiTmpHeight;

		sMbSizeMap[iSpatialIdx].iMbWidth	= kiTmpWidth;
		sMbSizeMap[iSpatialIdx].iMbHeight	= kiTmpHeight;

		// the scratch row used further down has to cover the widest / tallest layer
		if ( kiTmpWidth > iLargestMbWidth )
			iLargestMbWidth = kiTmpWidth;
		if ( kiTmpHeight > iLargestMbHeight )
			iLargestMbHeight = kiTmpHeight;

		iNumMb *= sizeof(int16_t);
		sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache = iNumMb;
		iUnit2Size += iNumMb;

		++ iSpatialIdx;
	}

	// Adaptive size_cs, size_fdec by implementation dependency
	// pass 2: padded luma/chroma line sizes per layer; second index is kbBaseTemporalFlag
	iTemporalIdx= 0;
	while ( iTemporalIdx < iCntTid )
	{
		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);

		iSpatialIdx = 0;
		while ( iSpatialIdx < kiNumSpatialLayers )
		{
			SDLayerParam *fDlp					= &pParam->sDependencyLayers[iSpatialIdx];

			const int32_t kiWidthPad = WELS_ALIGN( fDlp->iFrameWidth, 16 ) + (PADDING_LENGTH<<1);
			iLineSizeY[iSpatialIdx][kbBaseTemporalFlag]	= WELS_ALIGN( kiWidthPad, 32 );
			iLineSizeUV[iSpatialIdx][kbBaseTemporalFlag]= WELS_ALIGN( (kiWidthPad>>1), 16 );

			iMapSpatialIdx[iCountLayersNeedCs[kbBaseTemporalFlag]][kbBaseTemporalFlag] = iSpatialIdx;
			++ iCountLayersNeedCs[kbBaseTemporalFlag];
			++ iSpatialIdx;
		}
		++ iTemporalIdx;
	}
	iSizeDec= kiUnit1Size * (iCountLayersNeedCs[0] + iCountLayersNeedCs[1]);
	iSizeEnc= kiUnit1Size * kiNumSpatialLayers;

	// two MB index tables (X and Y) per layer, hence iUnit2Size twice
	iNeedAllocSize = iSizeDec + iSizeEnc + (iUnit2Size << 1);

	pBase = (uint8_t *)pMa->WelsMalloc( iNeedAllocSize, "pBase" );
	if ( NULL == pBase )
	{
		return 1;
	}

	pBaseDec= pBase;		// iCountLayersNeedCs
	pBaseEnc= pBaseDec + iSizeDec;		// iNumSpatialLayers
	pBaseMbX = pBaseEnc + iSizeEnc;	// iNumSpatialLayers
	pBaseMbY = pBaseMbX + iUnit2Size;	// iNumSpatialLayers

	// pass 3: fill one decoder block offset table per mapped (layer, temporal class) pair;
	// the very first one written is [0][1] and sits at the start of pBase
	iTemporalIdx= 0;
	while ( iTemporalIdx < iCntTid )
	{
		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);

		iSpatialIdx = 0;
		while ( iSpatialIdx < iCountLayersNeedCs[kbBaseTemporalFlag] )
		{
			const int32_t kiActualSpatialIdx = iMapSpatialIdx[iSpatialIdx][kbBaseTemporalFlag];
			const int32_t kiLumaWidth	= iLineSizeY[kiActualSpatialIdx][kbBaseTemporalFlag];
			const int32_t kiChromaWidth	= iLineSizeUV[kiActualSpatialIdx][kbBaseTemporalFlag];

			WelsGetEncBlockStrideOffset( (int32_t *)pBaseDec, kiLumaWidth, kiChromaWidth );

			pPtr->pStrideDecBlockOffset[kiActualSpatialIdx][kbBaseTemporalFlag]	= (int32_t *)pBaseDec;
			pBaseDec+= kiUnit1Size;

			++ iSpatialIdx;
		}
		++ iTemporalIdx;
	}

	// pass 4: a layer that did not get its own table shares the first mapped one
	iTemporalIdx= 0;
	while ( iTemporalIdx < iCntTid )
	{
		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);

		iSpatialIdx = 0;
		while (iSpatialIdx < kiNumSpatialLayers)
		{
			int32_t iMatchIndex = 0;
			bool_t bInMap = false;
			bool_t bMatchFlag = false;

			i = 0;
			while ( i < iCountLayersNeedCs[kbBaseTemporalFlag] )
			{
				const int32_t kiActualIdx = iMapSpatialIdx[i][kbBaseTemporalFlag];
				if ( kiActualIdx == iSpatialIdx )
				{
					bInMap	= true;
					break;
				}
				if ( !bMatchFlag )
				{
					iMatchIndex	= kiActualIdx;
					bMatchFlag	= true;
				}
				++ i;
			}

			if ( bInMap )
			{
				++ iSpatialIdx;
				continue;
			}

			// not in spatial map and assign match one to it
			pPtr->pStrideDecBlockOffset[iSpatialIdx][kbBaseTemporalFlag]	= pPtr->pStrideDecBlockOffset[iMatchIndex][kbBaseTemporalFlag];

			++ iSpatialIdx;
		}
		++ iTemporalIdx;
	}

	// pass 5: hand out the encoder offset table and the MB X/Y index tables of each layer
	iSpatialIdx = 0;
	while ( iSpatialIdx < kiNumSpatialLayers )
	{
		const int32_t kiAllocMbSize = sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache;

		pPtr->pStrideEncBlockOffset[iSpatialIdx]	= (int32_t *)pBaseEnc;

		pPtr->pMbIndexX[iSpatialIdx]				= (int16_t *)pBaseMbX;
		pPtr->pMbIndexY[iSpatialIdx]				= (int16_t *)pBaseMbY;

		pBaseEnc += kiUnit1Size;
		pBaseMbX += kiAllocMbSize;
		pBaseMbY += kiAllocMbSize;

		++ iSpatialIdx;
	}

	// layers beyond the configured count carry no tables
	while ( iSpatialIdx < MAX_DEPENDENCY_LAYER )
	{
		pPtr->pStrideDecBlockOffset[iSpatialIdx][0]	= NULL;
		pPtr->pStrideDecBlockOffset[iSpatialIdx][1]	= NULL;
		pPtr->pStrideEncBlockOffset[iSpatialIdx]		= NULL;
		pPtr->pMbIndexX[iSpatialIdx]					= NULL;
		pPtr->pMbIndexY[iSpatialIdx]					= NULL;

		++ iSpatialIdx;
	}

	// initialize pMbIndexX and pMbIndexY tables as below

	iMaxMbWidth	= iLargestMbWidth;
	iMaxMbWidth	= WELS_ALIGN(iMaxMbWidth, 4);	// 4 loops for int16_t required introduced as below
	iRowSize		= iMaxMbWidth * sizeof(int16_t);

	pTmpRow = (int16_t*)pMa->WelsMalloc( iRowSize, "pTmpRow" );
	if ( NULL == pTmpRow )
	{
		return 1;
	}
	pRowX = pTmpRow;
	pRowY = pRowX;	// the same scratch row is reused for Y once the X tables are done
	// initialize pRowX & pRowY
	// pRowX[k] = k, written four entries per step (iMaxMbWidth is a multiple of 4)
	i = 0;
	p = pRowX;
	while ( i < iMaxMbWidth )
	{
		*p		= i;
		*(p+1)	= 1+i;
		*(p+2)	= 2+i;
		*(p+3)	= 3+i;

		p += 4;
		i += 4;
	}

	// every MB row of pMbIndexX holds the column indices 0 .. kiMbWidth-1
	iSpatialIdx = kiNumSpatialLayers;
	while ( --iSpatialIdx >= 0 )
	{
		int16_t *pMbIndexX = pPtr->pMbIndexX[iSpatialIdx];
		const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
		const int32_t kiMbHeight	= sMbSizeMap[iSpatialIdx].iMbHeight;
		const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);

		i = 0;
		while ( i < kiMbHeight )
		{
			memcpy( pMbIndexX, pRowX, kiLineSize );	// confirmed_safe_unsafe_usage

			pMbIndexX += kiMbWidth;
			++ i;
		}
	}

	// MB row i of pMbIndexY holds the value i in every entry; row 0 starts out as all zero
	memset(pRowY, 0, iRowSize);
	iMaxMbHeight	= iLargestMbHeight;
	i = 0;
	for (;;)
	{
		ENFORCE_STACK_ALIGN_1D(int16_t, t, 4, 16)

		int32_t t32 = 0;
		int16_t j = 0;

		for ( iSpatialIdx = kiNumSpatialLayers-1; iSpatialIdx >= 0; -- iSpatialIdx )
		{
			const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
			const int32_t kiMbHeight = sMbSizeMap[iSpatialIdx].iMbHeight;
			const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);
			int16_t *pMbIndexY = pPtr->pMbIndexY[iSpatialIdx] + i * kiMbWidth;

			if ( i < kiMbHeight )
			{
				memcpy( pMbIndexY, pRowY, kiLineSize );	// confirmed_safe_unsafe_usage
			}
		}
		++ i;
		if (i >= iMaxMbHeight)
			break;

		// refill the scratch row with the next row index: t holds four int16_t copies of i
		t32 = i | (i << 16);
		ST32( t  , t32 );
		ST32( t+2, t32 );

		p = pRowY;
		while ( j < iMaxMbWidth )
		{
			ST64( p, LD64(t) );

			p += 4;
			j += 4;
		}
	}

	pMa->WelsFree( pTmpRow, "pTmpRow" );
	pTmpRow = NULL;

	return 0;
}

/*!
 * \brief	request specific memory for SVC
 * \param	ppCtx	address of the encoder context pointer; pSvcParam and pMemAlign of the
 *					context are read here and every buffer allocated is attached to it
 * \return	successful - 0; otherwise none 0 for failed
 *
 * \note	Every failure path runs FreeMemorySvc( ppCtx ), which releases the context itself
 *			and sets *ppCtx to NULL.
 * \note	Containers whose members are released by FreeMemorySvc() (pOut, ppRefPicListExt,
 *			ppDqLayerList) are requested zero-filled, so that a failure happening before all
 *			of their members are populated never hands an indeterminate pointer to the free path.
 */
int32_t RequestMemorySvc( sWelsEncCtx **ppCtx )
{
	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
	SDLayerParam *pFinalSpatial	= NULL;
	int32_t iCountBsLen			= 0;
	int32_t iCountNals				= 0;
	int32_t iMaxPicWidth			= 0;
	int32_t iMaxPicHeight			= 0;
	int32_t iCountMaxMbNum		= 0;
	int32_t iIndex					= 0;
	int32_t iCountLayers			= 0;
	int32_t iResult					= 0;
	float	fCompressRatioThr		= .5f;
	const int32_t kiNumDependencyLayers	= pParam->iNumDependencyLayer;
	const uint32_t kuiMvdInterTableSize	=  ( kiNumDependencyLayers == 1 ? (1 + (648 << 1)) : (1 + (972 << 1)) );
	const uint32_t kuiMvdCacheAlginedSize	= kuiMvdInterTableSize * sizeof(uint16_t);
	int32_t iVclLayersBsSizeCount		= 0;
	int32_t iNonVclLayersBsSizeCount	= 0;
#if defined(MT_ENABLED)
	int32_t iTargetSpatialBsSize			= 0;
#endif//MT_ENABLED

	if ( kiNumDependencyLayers < 1 || kiNumDependencyLayers > MAX_DEPENDENCY_LAYER )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid iNumDependencyLayers(%d)!\n", kiNumDependencyLayers);
		FreeMemorySvc( ppCtx );
		return 1;
	}

	// the intra period, when set, has to be a multiple of the GOP size
	if ( pParam->uiGopSize == 0 || ( pParam->uiIntraPeriod && ((pParam->uiIntraPeriod % pParam->uiGopSize) != 0)) )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid uiIntraPeriod(%d) (=multipler of uiGopSize(%d)!",
			pParam->uiIntraPeriod, pParam->uiGopSize);
		FreeMemorySvc( ppCtx );
		return 1;
	}

	// per-MB buffers below are dimensioned by the last (highest index) dependency layer
	pFinalSpatial	= &pParam->sDependencyLayers[kiNumDependencyLayers - 1];
	iMaxPicWidth	= pFinalSpatial->iFrameWidth;
	iMaxPicHeight	= pFinalSpatial->iFrameHeight;
	iCountMaxMbNum= ((15+iMaxPicWidth)>>4) * ((15+iMaxPicHeight)>>4);

	iResult = AcquireLayersNals( ppCtx, pParam, &iCountLayers, &iCountNals );
	if ( iResult )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AcquireLayersNals failed(%d)!", iResult);
		FreeMemorySvc( ppCtx );
		return 1;
	}

	iNonVclLayersBsSizeCount = SSEI_BUFFER_SIZE + kiNumDependencyLayers * SPS_BUFFER_SIZE + (1+kiNumDependencyLayers) * PPS_BUFFER_SIZE;

	// VCL budget: I420 frame size of each layer scaled by a resolution dependent ratio
	int32_t iLayerBsSize = 0;
	iIndex = 0;
	while(iIndex < kiNumDependencyLayers)
	{
		SDLayerParam *fDlp = &pParam->sDependencyLayers[iIndex];

		fCompressRatioThr	= COMPRESS_RATIO_DECIDED_BY_RESOLUTION(fDlp->iFrameWidth, fDlp->iFrameHeight);

		iLayerBsSize = WELS_ROUND( ( (3 * fDlp->iFrameWidth * fDlp->iFrameHeight)>>1) * fCompressRatioThr);
		iLayerBsSize	= WELS_ALIGN(iLayerBsSize, 4);			// 4 bytes alinged
		iVclLayersBsSizeCount += iLayerBsSize;
		++ iIndex;
	}
#if defined(MT_ENABLED)
	// size computed for the last layer visited by the loop above
	iTargetSpatialBsSize = iLayerBsSize;
#endif//MT_ENABLED
	iCountBsLen = iNonVclLayersBsSizeCount + iVclLayersBsSizeCount;

	pParam->iNumRefFrame	= WELS_CLIP3(pParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);

	// Output
	// zero-filled: FreeMemorySvc() releases pBsBuffer and sNalList whenever they are not NULL
	(*ppCtx)->pOut = (SWelsEncoderOutput *)pMa->WelsMallocz( sizeof(SWelsEncoderOutput), "SWelsEncoderOutput" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pOut->pBsBuffer		= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pOut->pBsBuffer" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->pBsBuffer), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pOut->uiSize			= iCountBsLen;
	(*ppCtx)->pOut->sNalList		= (SWelsNalRaw *)pMa->WelsMalloc( iCountNals * sizeof(SWelsNalRaw), "pOut->sNalList" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->sNalList), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pOut->iCountNals		= iCountNals;
	(*ppCtx)->pOut->iNalIndex		= 0;

#ifdef MT_ENABLED
	if ( pParam->iMultipleThreadIdc > 1 )
	{
		// record exactly what is allocated, so that the stored size never exceeds the buffer
		const int32_t kiFrameBsLen	= iCountBsLen + (iTargetSpatialBsSize * ((*ppCtx)->iMaxSliceCount-1));

		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( kiFrameBsLen, "pFrameBs" );
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
		(*ppCtx)->iFrameBsSize		= kiFrameBsLen;
	}
	else
#endif//MT_ENABLED
	{
		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pFrameBs" );
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
		(*ppCtx)->iFrameBsSize		= iCountBsLen;
	}
	(*ppCtx)->iPosBsBuffer		= 0;

#ifdef MT_ENABLED
	// for pSlice bs buffers
	if ( pParam->iMultipleThreadIdc > 1 && RequestMtResource( ppCtx, pParam, iCountBsLen, iTargetSpatialBsSize ) )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), RequestMtResource failed!");
		FreeMemorySvc( ppCtx );
		return 1;
	}
#endif

	// per-MB working buffers, all dimensioned by iCountMaxMbNum
	(*ppCtx)->pIntra4x4PredModeBlocks = static_cast<int8_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * INTRA_4x4_MODE_NUM, "pIntra4x4PredModeBlocks" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pIntra4x4PredModeBlocks), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pNonZeroCountBlocks = static_cast<int8_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * MB_LUMA_CHROMA_BLOCK4x4_NUM, "pNonZeroCountBlocks" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pNonZeroCountBlocks), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pMvUnitBlock4x4 = static_cast<SMVUnitXY*>
		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK4x4_NUM * sizeof(SMVUnitXY), "pMvUnitBlock4x4" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvUnitBlock4x4), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pRefIndexBlock4x4 = static_cast<int8_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK8x8_NUM * sizeof(int8_t), "pRefIndexBlock4x4" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pRefIndexBlock4x4), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pSadCostMb	= static_cast<int32_t*>
		  (pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pSadCostMb" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSadCostMb), FreeMemorySvc(ppCtx))

	(*ppCtx)->bEncCurFrmAsIdrFlag = true;  // make sure first frame is IDR
	(*ppCtx)->iGlobalQp				= 26;	// global qp in default

	(*ppCtx)->pLtr = (SLTRState *)pMa->WelsMalloc( kiNumDependencyLayers*sizeof(SLTRState), "SLTRState" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pLtr), FreeMemorySvc(ppCtx) )
	int32_t i = 0;
	for( i = 0; i < kiNumDependencyLayers; i++ )
	{
		ResetLtrState( &(*ppCtx)->pLtr[i] );
	}

	// zero-filled: FreeMemorySvc() walks every per-layer slot and releases the non-NULL ones
	(*ppCtx)->ppRefPicListExt	= (SRefList**)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SRefList *), "ppRefPicListExt" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppRefPicListExt), FreeMemorySvc(ppCtx) )

	// pSlice context list
	(*ppCtx)->pSliceCtxList	= (SSliceCtx *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SSliceCtx), "pSliceCtxList" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSliceCtxList), FreeMemorySvc(ppCtx) )

	// zero-filled for the same reason as ppRefPicListExt
	(*ppCtx)->ppDqLayerList	= (SDqLayer **)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SDqLayer *), "ppDqLayerList" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppDqLayerList), FreeMemorySvc(ppCtx) )

	// stride tables
	if ( AllocStrideTables( ppCtx, kiNumDependencyLayers ) )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AllocStrideTables failed!");
		FreeMemorySvc( ppCtx );
		return 1;
	}

	//Rate control module memory allocation
	// only malloc once for RC pData, 12/14/2009
	(*ppCtx)->pWelsSvcRc = (SWelsSvcRc *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SWelsSvcRc), "pWelsSvcRc" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pWelsSvcRc), FreeMemorySvc(ppCtx) )
	//End of Rate control module memory allocation

	//pVaa memory allocation
	(*ppCtx)->pVaa	= (SVAAFrameInfo *)pMa->WelsMallocz( sizeof(SVAAFrameInfo), "pVaa" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa), FreeMemorySvc(ppCtx) )

	if((*ppCtx)->pSvcParam->bEnableAdaptiveQuant)//malloc mem
	{
		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit   = static_cast<SMotionTextureUnit*>
			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(SMotionTextureUnit), "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" ));
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit), FreeMemorySvc(ppCtx) )
		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp   = static_cast<int8_t*>
			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" ));
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp), FreeMemorySvc(ppCtx) )
	}

	(*ppCtx)->pVaa->pVaaBackgroundMbFlag = (int8_t *)pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->vaa_skip_mb_flag" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->pVaaBackgroundMbFlag), FreeMemorySvc(ppCtx) )

	(*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8 = static_cast<int32_t(*)[4]>
	    (pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sad8x8" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16 = static_cast<int32_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSsd16x16" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16 = static_cast<int32_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSum16x16" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16), FreeMemorySvc(ppCtx) )
	(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16 = static_cast<int32_t*>
		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSumOfSquare16x16" ));
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16), FreeMemorySvc(ppCtx) )

	if ((*ppCtx)->pSvcParam->bEnableBackgroundDetection)  //BGD control
	{
		(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8 = static_cast<int32_t(*)[4]>
			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sd_16x16" ));
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8), FreeMemorySvc(ppCtx) )
		(*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8 = static_cast<uint8_t(*)[4]>
			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(uint8_t), "pVaa->sVaaCalcInfo.mad_16x16" ));
		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8), FreeMemorySvc(ppCtx) )
	}

	//End of pVaa memory allocation

	iResult = InitDqLayers( ppCtx );
	if ( iResult )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitDqLayers failed(%d)!", iResult );
		FreeMemorySvc( ppCtx );
		return iResult;
	}

	if( InitMbListD( ppCtx ) )
	{
		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitMbListD failed!" );
		FreeMemorySvc( ppCtx );
		return 1;
	}

	// 52 tables of kuiMvdInterTableSize uint16_t entries each
	(*ppCtx)->pMvdCostTableInter = (uint16_t *)pMa->WelsMallocz( 52 * kuiMvdCacheAlginedSize, "pMvdCostTableInter" );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvdCostTableInter), FreeMemorySvc(ppCtx) )
	MvdCostInit( (*ppCtx)->pMvdCostTableInter, kuiMvdInterTableSize ); //should put to a better place?

	// start with the first reference picture of layer 0 as the picture being reconstructed
	if ( (*ppCtx)->ppRefPicListExt[0] != NULL && (*ppCtx)->ppRefPicListExt[0]->pRef[0] != NULL )
		(*ppCtx)->pDecPic				= (*ppCtx)->ppRefPicListExt[0]->pRef[0];
	else
		(*ppCtx)->pDecPic				= NULL;	// error here

	(*ppCtx)->pSps				= &(*ppCtx)->pSpsArray[0];
	(*ppCtx)->pPps				= &(*ppCtx)->pPPSArray[0];

	return 0;
}


/*!
 * \brief	free memory	in SVC core encoder
 * \param	ppCtx	address of the encoder context pointer; nothing is done when *ppCtx is NULL
 * \return	none
 *
 * \note	Releases every buffer attached to the context, then the coding parameters, the
 *			function pointer list, the memory allocator object and finally the context itself.
 *			*ppCtx is NULL on return.
 * \note	The sections that need the coding parameters are skipped when pSvcParam is NULL.
 */
void FreeMemorySvc( sWelsEncCtx **ppCtx )
{
	if ( NULL != *ppCtx )
	{
		sWelsEncCtx *pCtx	= *ppCtx;
		CMemoryAlign *pMa			= pCtx->pMemAlign;
		SWelsSvcCodingParam *pParam= pCtx->pSvcParam;
		int32_t ilayer				= 0;

		// SStrideTables
		if ( NULL != pCtx->pStrideTab )
		{
			// this slot is the one used to release the shared "pBase" buffer of all tables
			if ( NULL != pCtx->pStrideTab->pStrideDecBlockOffset[0][1] )
			{
				pMa->WelsFree( pCtx->pStrideTab->pStrideDecBlockOffset[0][1], "pBase" );
				pCtx->pStrideTab->pStrideDecBlockOffset[0][1] = NULL;
			}
			pMa->WelsFree(pCtx->pStrideTab, "SStrideTables");
			pCtx->pStrideTab = NULL;
		}
		// pDq idc map
		if ( NULL != pCtx->pDqIdcMap )
		{
			pMa->WelsFree( pCtx->pDqIdcMap, "pDqIdcMap" );
			pCtx->pDqIdcMap = NULL;
		}

		if ( NULL != pCtx->pOut )
		{
			// bs pBuffer
			if ( NULL != pCtx->pOut->pBsBuffer )
			{
				pMa->WelsFree( pCtx->pOut->pBsBuffer, "pOut->pBsBuffer" );
				pCtx->pOut->pBsBuffer = NULL;
			}
			// NALs list
			if ( NULL != pCtx->pOut->sNalList )
			{
				pMa->WelsFree( pCtx->pOut->sNalList, "pOut->sNalList" );
				pCtx->pOut->sNalList = NULL;
			}
			pMa->WelsFree( pCtx->pOut, "SWelsEncoderOutput" );
			pCtx->pOut = NULL;
		}

#ifdef MT_ENABLED
		if ( pParam != NULL && pParam->iMultipleThreadIdc > 1 )
			ReleaseMtResource( ppCtx );
#endif//MT_ENABLED

		// frame bitstream pBuffer
		if ( NULL != pCtx->pFrameBs )
		{
			pMa->WelsFree( pCtx->pFrameBs, "pFrameBs" );
			pCtx->pFrameBs = NULL;
		}

		// pSpsArray
		if ( NULL != pCtx->pSpsArray )
		{
			pMa->WelsFree( pCtx->pSpsArray, "pSpsArray" );
			pCtx->pSpsArray = NULL;
		}
		// pPPSArray
		if ( NULL != pCtx->pPPSArray )
		{
			pMa->WelsFree( pCtx->pPPSArray, "pPPSArray" );
			pCtx->pPPSArray = NULL;
		}
		// subset_sps_array
		if ( NULL != pCtx->pSubsetArray )
		{
			pMa->WelsFree( pCtx->pSubsetArray, "pSubsetArray" );
			pCtx->pSubsetArray = NULL;
		}

		if( NULL != pCtx->pIntra4x4PredModeBlocks ){
			pMa->WelsFree( pCtx->pIntra4x4PredModeBlocks, "pIntra4x4PredModeBlocks" );
			pCtx->pIntra4x4PredModeBlocks = NULL;
		}

		if( NULL != pCtx->pNonZeroCountBlocks ){
			pMa->WelsFree( pCtx->pNonZeroCountBlocks, "pNonZeroCountBlocks" );
			pCtx->pNonZeroCountBlocks = NULL;
		}

		if ( NULL != pCtx->pMvUnitBlock4x4)
		{
			pMa->WelsFree( pCtx->pMvUnitBlock4x4, "pMvUnitBlock4x4" );
			pCtx->pMvUnitBlock4x4	= NULL;
		}

		if ( NULL != pCtx->pRefIndexBlock4x4)
		{
			pMa->WelsFree( pCtx->pRefIndexBlock4x4, "pRefIndexBlock4x4" );
			pCtx->pRefIndexBlock4x4	= NULL;
		}

		// only element 0 of ppMbListD is released individually here
		if ( NULL != pCtx->ppMbListD )
		{
			if( NULL != pCtx->ppMbListD[0] ){
				pMa->WelsFree( pCtx->ppMbListD[0], "ppMbListD[0]" );
				pCtx->ppMbListD[0] = NULL;
			}
			pMa->WelsFree( pCtx->ppMbListD, "ppMbListD" );
			pCtx->ppMbListD = NULL;
		}

		if ( NULL != pCtx->pSadCostMb)
		{
			pMa->WelsFree( pCtx->pSadCostMb, "pSadCostMb" );
			pCtx->pSadCostMb = NULL;
		}

		// SLTRState
		if ( NULL != pCtx->pLtr )
		{
			pMa->WelsFree( pCtx->pLtr, "SLTRState" );
			pCtx->pLtr = NULL;
		}

		// pDq layers list
		ilayer = 0;
		if ( NULL != pCtx->ppDqLayerList && pParam != NULL )
		{
			while (ilayer < pParam->iNumDependencyLayer) {
				SDqLayer *pDq	= pCtx->ppDqLayerList[ilayer];
				SDLayerParam *pDlp = &pParam->sDependencyLayers[ilayer];
				const BOOL_T kbIsDynamicSlicing = (SM_DYN_SLICE == pDlp->sMso.uiSliceMode);

				// pDq layers
				if ( NULL != pDq )
				{
					if ( NULL != pDq->sLayerInfo.pSliceInLayer )
					{
						// slice count is recomputed from the layer geometry and slice options,
						// at least one slice is always present
						int32_t iSliceIdx = 0;
						int32_t iSliceNum = GetInitialSliceNum( pDq->iMbWidth, pDq->iMbHeight, &pDlp->sMso );
						if (iSliceNum < 1)
							iSliceNum = 1;
						while(iSliceIdx < iSliceNum)
						{
							SSlice *pSlice = &pDq->sLayerInfo.pSliceInLayer[iSliceIdx];
							FreeMbCache(&pSlice->sMbCacheInfo, pMa);
							++ iSliceIdx;
						}
						pMa->WelsFree( pDq->sLayerInfo.pSliceInLayer, "pSliceInLayer" );
						pDq->sLayerInfo.pSliceInLayer = NULL;
					}
					if ( kbIsDynamicSlicing )
					{
						pMa->WelsFree( pDq->pNumSliceCodedOfPartition, "pNumSliceCodedOfPartition" );
						pDq->pNumSliceCodedOfPartition	= NULL;
						pMa->WelsFree( pDq->pLastCodedMbIdxOfPartition, "pLastCodedMbIdxOfPartition" );
						pDq->pLastCodedMbIdxOfPartition	= NULL;
						pMa->WelsFree( pDq->pLastMbIdxOfPartition, "pLastMbIdxOfPartition" );
						pDq->pLastMbIdxOfPartition = NULL;
					}

					pMa->WelsFree( pDq, "pDq" );
					pDq = NULL;
					pCtx->ppDqLayerList[ilayer] = NULL;
				}
				++ ilayer;
			}
			pMa->WelsFree( pCtx->ppDqLayerList, "ppDqLayerList" );
			pCtx->ppDqLayerList = NULL;
		}
		FreeSpatialPictures( pCtx );

		// reference picture list extension
		if ( NULL != pCtx->ppRefPicListExt && pParam != NULL )
		{
			ilayer = 0;
			while (ilayer < pParam->iNumDependencyLayer) {
				SRefList *pRefList		= pCtx->ppRefPicListExt[ilayer];
				if ( NULL != pRefList )
				{
					// slots 0 .. iNumRefFrame inclusive, i.e. 1 + iNumRefFrame pictures
					int32_t iRef = 0;
					do {
						if ( pRefList->pRef[iRef] != NULL )
						{
							FreePicture( pMa, &pRefList->pRef[iRef] );
						}
						++ iRef;
					} while(iRef < 1 + pParam->iNumRefFrame);

					pMa->WelsFree( pCtx->ppRefPicListExt[ilayer], "ppRefPicListExt[]" );
					pCtx->ppRefPicListExt[ilayer] = NULL;
				}
				++ ilayer;
			}

			pMa->WelsFree( pCtx->ppRefPicListExt, "ppRefPicListExt" );
			pCtx->ppRefPicListExt = NULL;
		}

		// pSlice context list
		if ( NULL != pCtx->pSliceCtxList && pParam != NULL )
		{
			ilayer = 0;
			while (ilayer < pParam->iNumDependencyLayer) {
				// address of an array element, never NULL once the list itself is valid
				UninitSlicePEncCtx( &pCtx->pSliceCtxList[ilayer], pMa );
				++ ilayer;
			}
			pMa->WelsFree( pCtx->pSliceCtxList, "pSliceCtxList" );
			pCtx->pSliceCtxList = NULL;
		}

		// VAA
		if ( NULL != pCtx->pVaa )
		{
			// the optional buffers depend on the coding parameters; guard against a missing
			// parameter set the same way the other sections of this function do
			if ( pParam != NULL && pParam->bEnableAdaptiveQuant )//free mem
			{
				pMa->WelsFree( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit, "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" );
				pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit = NULL;
				pMa->WelsFree( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp, "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" );
				pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp = NULL;
			}

			pMa->WelsFree( pCtx->pVaa->pVaaBackgroundMbFlag, "pVaa->pVaaBackgroundMbFlag");
			pCtx->pVaa->pVaaBackgroundMbFlag	= NULL;
			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSad8x8, "pVaa->sVaaCalcInfo.sad8x8" );
			pCtx->pVaa->sVaaCalcInfo.pSad8x8		= NULL;
			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSsd16x16, "pVaa->sVaaCalcInfo.pSsd16x16" );
			pCtx->pVaa->sVaaCalcInfo.pSsd16x16	= NULL;
			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSum16x16, "pVaa->sVaaCalcInfo.pSum16x16" );
			pCtx->pVaa->sVaaCalcInfo.pSum16x16	= NULL;
			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16, "pVaa->sVaaCalcInfo.pSumOfSquare16x16" );
			pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16		= NULL;

			if ( pParam != NULL && pParam->bEnableBackgroundDetection ) //BGD control
			{
				pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8, "pVaa->sVaaCalcInfo.pSumOfDiff8x8" );
				pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8	= NULL;
				pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pMad8x8, "pVaa->sVaaCalcInfo.pMad8x8" );
				pCtx->pVaa->sVaaCalcInfo.pMad8x8	= NULL;
			}

			pMa->WelsFree( pCtx->pVaa, "pVaa" );
			pCtx->pVaa = NULL;
		}

		WelsRcFreeMemory(pCtx);
		// rate control module memory free
		if ( NULL != pCtx->pWelsSvcRc )
		{
			pMa->WelsFree( pCtx->pWelsSvcRc, "pWelsSvcRc" );
			pCtx->pWelsSvcRc = NULL;
		}

		/* MVD cost tables for Inter */
		if ( NULL != pCtx->pMvdCostTableInter )
		{
			pMa->WelsFree( pCtx->pMvdCostTableInter, "pMvdCostTableInter" );
			pCtx->pMvdCostTableInter = NULL;
		}

#ifdef ENABLE_TRACE_FILE
		if ( NULL != pCtx->pFileLog )
		{
			fclose( pCtx->pFileLog );
			pCtx->pFileLog	= NULL;
		}
		pCtx->uiSizeLog	= 0;
#endif//ENABLE_TRACE_FILE

		// pParam is handed to FreeCodingParam() here and must not be used afterwards
		FreeCodingParam( &pCtx->pSvcParam, pMa );
		if ( NULL != pCtx->pFuncList )
		{
			pMa->WelsFree(pCtx->pFuncList, "SWelsFuncPtrList");
			pCtx->pFuncList = NULL;
		}

#if defined(MEMORY_MONITOR)
		assert(pMa->WelsGetMemoryUsage() == 0);	// ensure all memory free well
#endif//MEMORY_MONITOR

		if ( (*ppCtx)->pMemAlign != NULL )
		{
			WelsLog( NULL, WELS_LOG_INFO, "FreeMemorySvc(), verify memory usage (%d bytes) after free..\n", (*ppCtx)->pMemAlign->WelsGetMemoryUsage() );
			delete (*ppCtx)->pMemAlign;
			(*ppCtx)->pMemAlign = NULL;
		}

		// the context itself is released with free(), not through the allocator object
		free(*ppCtx);
		*ppCtx = NULL;
	}
}

/*!
 * \brief	settle the slice number of every spatial layer and derive the thread settings
 *
 * For each dependency layer the effective slice number is written back into the
 * layer's slice argument and the largest slice count over all layers is tracked.
 * Afterwards iCountThreadsNum / iMultipleThreadIdc of pCodingParam are updated.
 *
 * \param	pCodingParam	coding parameters, updated in place
 * \param	kiCpuCores		number of cpu cores considered for slice/thread assignment
 * \param	pMaxSliceCount	output: maximal slice count found over all spatial layers
 * \return	always 0
 */
int32_t InitSliceSettings( SWelsSvcCodingParam *pCodingParam, const int32_t kiCpuCores, int16_t *pMaxSliceCount )
{
	int32_t iSpatialIdx = 0, iSpatialNum = pCodingParam->iNumDependencyLayer;
	int16_t iMaxSliceCount = 0;

	// do/while: the first dependency layer is always visited
	do {
		SDLayerParam *pDlp				= &pCodingParam->sDependencyLayers[iSpatialIdx];
		SMulSliceOption *pMso			= &pDlp->sMso;
		SSliceArgument *pSlcArg			= &pMso->sSliceArgument;
		const int32_t kiMbWidth			= (pDlp->iFrameWidth+15)>>4;	// frame width in 16x16 macroblocks, rounded up
		const int32_t kiMbHeight		= (pDlp->iFrameHeight+15)>>4;	// frame height in 16x16 macroblocks, rounded up
		const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
		// iSliceNum per input has been validated at ParamValidationExt()
#if defined(MT_ENABLED)
#if defined(DYNAMIC_SLICE_ASSIGN)
		int32_t iSliceNum				= (SM_FIXEDSLCNUM_SLICE == pMso->uiSliceMode || SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum;
#else//!DYNAMIC_SLICE_ASSIGN
		// same field as used by every other branch (was a stale pre-rename member name)
		int32_t iSliceNum				= (SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum;
#endif//DYNAMIC_SLICE_ASSIGN
#else//!MT_ENABLED
		int16_t iSliceNum				= pSlcArg->iSliceNum;
#endif//MT_ENABLED

		// NOTE: Per design, in case MT/DYNAMIC_SLICE_ASSIGN enabled, for SM_FIXEDSLCNUM_SLICE mode,
		// the slice number of the current spatial layer equals the cpu core number; in SM_DYN_SLICE mode
		// the slice number is initialized to the cpu core number as well.
		pSlcArg->iSliceNum	= iSliceNum;	// used fixed one

		switch(pMso->uiSliceMode)
		{
		case SM_DYN_SLICE:
			iMaxSliceCount	= AVERSLICENUM_CONSTRAINT;
			break;	// deliberately no fall-through into SM_FIXEDSLCNUM_SLICE
		case SM_FIXEDSLCNUM_SLICE:
			if ( iSliceNum > iMaxSliceCount )
				iMaxSliceCount = iSliceNum;
			// the slice number may have changed above, so the slice argument is checked again here
			if (pCodingParam->bEnableRc)
			{
				GomValidCheckSliceMbNum( kiMbWidth, kiMbHeight, pSlcArg );
			}
			else
			{
				CheckFixedSliceNumMultiSliceSetting( kiMbNumInFrame, pSlcArg );
			}
			break;
		case SM_SINGLE_SLICE:
		case SM_RASTER_SLICE:
		case SM_ROWMB_SLICE:
			// these modes only contribute to the running maximum
			if ( iSliceNum > iMaxSliceCount )
				iMaxSliceCount = iSliceNum;
			break;
		default:
			break;
		}

		++ iSpatialIdx;
	} while(iSpatialIdx < iSpatialNum);

#ifdef MT_ENABLED
	pCodingParam->iCountThreadsNum		= WELS_MIN(kiCpuCores, iMaxSliceCount);
	pCodingParam->iMultipleThreadIdc	= pCodingParam->iCountThreadsNum;
#else
	pCodingParam->iMultipleThreadIdc	= 1;
	pCodingParam->iCountThreadsNum		= 1;
#endif//MT_ENABLED

#ifndef WELS_TESTBED	// for product release and non-SGE testing

	if ( kiCpuCores < 2 )	// single CPU core, make no sense for MT parallelization
	{
		pCodingParam->iMultipleThreadIdc	= 1;
		pCodingParam->iCountThreadsNum		= 1;
	}
#endif

	*pMaxSliceCount					= iMaxSliceCount;

	return 0;
}

/*!
 * \brief	report the detected cpu features/capabilities
 *
 * One 'Y'/'N' character is derived per feature bit and handed to WelsLog();
 * when _DEBUG is defined the same report is also written to stderr.
 *
 * \param	uiCpuFeatureFlags	bit mask of WELS_CPU_* feature flags
 * \param	uiCpuCores			number of logic processors to report
 * \param	iCacheLineSize		cache line size in bytes to report
 */
void OutputCpuFeaturesLog( uint32_t uiCpuFeatureFlags, uint32_t uiCpuCores, int32_t iCacheLineSize )
{
	// evaluate every feature bit once, both outputs below share the results
	const char kcHtt		= (uiCpuFeatureFlags & WELS_CPU_HTT) ? 'Y' : 'N';
	const char kcMmx		= (uiCpuFeatureFlags & WELS_CPU_MMX) ? 'Y' : 'N';
	const char kcMmxExt		= (uiCpuFeatureFlags & WELS_CPU_MMXEXT) ? 'Y' : 'N';
	const char kcSse		= (uiCpuFeatureFlags & WELS_CPU_SSE) ? 'Y' : 'N';
	const char kcSse2		= (uiCpuFeatureFlags & WELS_CPU_SSE2) ? 'Y' : 'N';
	const char kcSse3		= (uiCpuFeatureFlags & WELS_CPU_SSE3) ? 'Y' : 'N';
	const char kcSsse3		= (uiCpuFeatureFlags & WELS_CPU_SSSE3) ? 'Y' : 'N';
	const char kcSse41		= (uiCpuFeatureFlags & WELS_CPU_SSE41) ? 'Y' : 'N';
	const char kcSse42		= (uiCpuFeatureFlags & WELS_CPU_SSE42) ? 'Y' : 'N';
	const char kcAvx		= (uiCpuFeatureFlags & WELS_CPU_AVX) ? 'Y' : 'N';
	const char kcFma		= (uiCpuFeatureFlags & WELS_CPU_FMA) ? 'Y' : 'N';
	const char kcFpu		= (uiCpuFeatureFlags & WELS_CPU_FPU) ? 'Y' : 'N';
	const char kc3dNow		= (uiCpuFeatureFlags & WELS_CPU_3DNOW) ? 'Y' : 'N';
	const char kc3dNowExt	= (uiCpuFeatureFlags & WELS_CPU_3DNOWEXT) ? 'Y' : 'N';
	const char kcAltivec	= (uiCpuFeatureFlags & WELS_CPU_ALTIVEC) ? 'Y' : 'N';
	const char kcCmov		= (uiCpuFeatureFlags & WELS_CPU_CMOV) ? 'Y' : 'N';
	const char kcMovbe		= (uiCpuFeatureFlags & WELS_CPU_MOVBE) ? 'Y' : 'N';
	const char kcAes		= (uiCpuFeatureFlags & WELS_CPU_AES) ? 'Y' : 'N';

	// welstracer output
	WelsLog(NULL, WELS_LOG_INFO,
		"WELS CPU features/capacities (0x%x) detected: \t"
		"HTT:      %c, "
		"MMX:      %c, "
		"MMXEX:    %c, "
		"SSE:      %c, "
		"SSE2:     %c, "
		"SSE3:     %c, "
		"SSSE3:    %c, "
		"SSE4.1:   %c, "
		"SSE4.2:   %c, "
		"AVX:      %c, "
		"FMA:      %c, "
		"X87-FPU:  %c, "
		"3DNOW:    %c, "
		"3DNOWEX:  %c, "
		"ALTIVEC:  %c, "
		"CMOV:     %c, "
		"MOVBE:    %c, "
		"AES:      %c, "
		"NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
		"CPU CACHE LINE SIZE (BYTES):        %d\n",
		uiCpuFeatureFlags,
		kcHtt, kcMmx, kcMmxExt,
		kcSse, kcSse2, kcSse3, kcSsse3, kcSse41, kcSse42,
		kcAvx, kcFma, kcFpu,
		kc3dNow, kc3dNowExt, kcAltivec,
		kcCmov, kcMovbe, kcAes,
		uiCpuCores,
		iCacheLineSize );

#ifdef _DEBUG	// output at console & _debug
	fprintf( stderr,
		"WELS CPU features/capacities (0x%x) detected: \n"
		"HTT:      %c, "
		"MMX:      %c, "
		"MMXEX:    %c, "
		"SSE:      %c, "
		"SSE2:     %c, "
		"SSE3:     %c, "
		"SSSE3:    %c, "
		"SSE4.1:   %c, "
		"SSE4.2:   %c, "
		"AVX:      %c, "
		"FMA:      %c, "
		"X87-FPU:  %c, "
		"3DNOW:    %c, "
		"3DNOWEX:  %c, "
		"ALTIVEC:  %c, "
		"CMOV:     %c, "
		"MOVBE:    %c, "
		"AES:      %c, "
		"NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
		"CPU CACHE LINE SIZE (BYTES):        %d\n",
		uiCpuFeatureFlags,
		kcHtt, kcMmx, kcMmxExt,
		kcSse, kcSse2, kcSse3, kcSsse3, kcSse41, kcSse42,
		kcAvx, kcFma, kcFpu,
		kc3dNow, kc3dNowExt, kcAltivec,
		kcCmov, kcMovbe, kcAes,
		uiCpuCores,
		iCacheLineSize );
#endif//_DEBUG
}

/*!
 * \brief	initialize Wels avc encoder core library
 *
 * Validates the parameters, detects cpu capabilities, settles the slice/thread
 * settings, then allocates the encoder context together with its aligned
 * memory pool, coding parameters, function pointer list and working memory.
 * On any failure the partially built context is released with FreeMemorySvc()
 * and *ppCtx is not set to a context.
 *
 * \pParam	ppCtx		sWelsEncCtx**, receives the new context on success
 * \pParam	pCodingParam	SWelsSvcCodingParam*, validated and updated in place
 * \return	successful - 0; otherwise none 0 for failed
 */
int32_t WelsInitEncoderExt( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pCodingParam )
{
	sWelsEncCtx *pCtx		= NULL;
	int32_t	iRet					= 0;
	uint32_t uiCpuFeatureFlags		= 0;	// CPU features
	int32_t uiCpuCores				= 1;	// number of logic processors on physical processor package, one logic processor means HTT not supported
	int32_t iCacheLineSize			= 16;	// on chip cache line size in byte
	int16_t iSliceNum				= 1;	// number of slices used

	if ( NULL == ppCtx || NULL == pCodingParam )
	{
		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), NULL == ppCtx(0x%p) or NULL == pCodingParam(0x%p).\n", (void *)ppCtx, (void *)pCodingParam);
		return 1;
	}

	iRet	=	ParamValidationExt( pCodingParam );
	if ( iRet != 0 )
	{
		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), ParamValidationExt failed return %d.\n", iRet);
		return iRet;
	}

	// for cpu features detection, Only detect once??
#ifdef X86_ASM
	uiCpuFeatureFlags	= WelsCPUFeatureDetect( &uiCpuCores );	// detect cpu capacity features
	if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_128 )
		iCacheLineSize = 128;
	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_64 )
		iCacheLineSize = 64;
	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_32 )
		iCacheLineSize	= 32;
	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_16 )
		iCacheLineSize	= 16;
	OutputCpuFeaturesLog( uiCpuFeatureFlags, uiCpuCores, iCacheLineSize );
#else
	iCacheLineSize	= 16;	// 16 bytes aligned in default
#endif//X86_ASM

#ifndef WELS_TESTBED

#if defined(MT_ENABLED) && defined(DYNAMIC_DETECT_CPU_CORES)
	if ( pCodingParam->iMultipleThreadIdc > 0 )
		uiCpuCores = pCodingParam->iMultipleThreadIdc;	// an explicit thread setting overrides detection
	else
	{
		if ( uiCpuFeatureFlags == 0 )	// cpuid not supported, use high level system API as followed to detect number of physical/logic processor
			uiCpuCores = DynamicDetectCpuCores();
		// So far so many cpu cores up to MAX_THREADS_NUM mean for server platforms,
		// for client application here it is constrained by maximal to MAX_THREADS_NUM
		if ( uiCpuCores > MAX_THREADS_NUM )	// MAX_THREADS_NUM
			uiCpuCores	= MAX_THREADS_NUM;	// MAX_THREADS_NUM
		else if ( uiCpuCores < 1 )	// just for safe
			uiCpuCores	= 1;
	}
#endif//MT_ENABLED && DYNAMIC_DETECT_CPU_CORES

#else//WELS_TESTBED

	uiCpuCores	= pCodingParam->iMultipleThreadIdc;	// assigned uiCpuCores from iMultipleThreadIdc from SGE testing

#endif//WELS_TESTBED

	uiCpuCores	= WELS_CLIP3(uiCpuCores, 1, MAX_THREADS_NUM);

	if ( InitSliceSettings(pCodingParam, uiCpuCores, &iSliceNum ) )
	{
		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), InitSliceSettings failed.\n");
		return 1;
	}

	*ppCtx	= NULL;

	pCtx	= static_cast<sWelsEncCtx*>(malloc( sizeof(sWelsEncCtx) ));

	WELS_VERIFY_RETURN_IF(1, (NULL == pCtx))
	memset( pCtx, 0, sizeof(sWelsEncCtx) );

	pCtx->pMemAlign = new CMemoryAlign( iCacheLineSize );
	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pCtx->pMemAlign), FreeMemorySvc(&pCtx) )

	// for logs
#ifdef ENABLE_TRACE_FILE
	if (wlog == WelsLogDefault)
	{
		str_t fname[MAX_FNAME_LEN] = {0};

#if defined (_MSC_VER)
#if _MSC_VER>=1500
		SNPRINTF(fname, MAX_FNAME_LEN, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
#else
		SNPRINTF(fname, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
#endif//_MSC_VER>=1500
#else
		//GNUC/
		SNPRINTF(fname,      MAX_FNAME_LEN,       "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
#endif//_MSC_VER


#if defined(__GNUC__)
		pCtx->pFileLog	= FOPEN(fname, "wt+");
#else//WIN32
#if defined(WIN32) && defined(_MSC_VER)
#if _MSC_VER >= 1500
		FOPEN(&pCtx->pFileLog,fname, "wt+");
#else
		pCtx->pFileLog	= FOPEN(fname, "wt+");
#endif//_MSC_VER>=1500
#endif//WIN32 && _MSC_VER
#endif//__GNUC__
		pCtx->uiSizeLog	= 0;
	}
#endif//ENABLE_TRACE_FILE

	pCodingParam->DetermineTemporalSettings();
	iRet = AllocCodingParam( &pCtx->pSvcParam, pCtx->pMemAlign, pCodingParam->iNumDependencyLayer );
	if ( iRet != 0 )
	{
		FreeMemorySvc( &pCtx );
		return iRet;
	}
	// the context keeps its own copy of the caller's parameters
	memcpy( pCtx->pSvcParam, pCodingParam, sizeof(SWelsSvcCodingParam) );	// confirmed_safe_unsafe_usage

	pCtx->pFuncList = (SWelsFuncPtrList *)pCtx->pMemAlign->WelsMalloc(sizeof(SWelsFuncPtrList), "SWelsFuncPtrList");
	if ( NULL == pCtx->pFuncList )
	{
		FreeMemorySvc( &pCtx );
		return 1;
	}
	InitFunctionPointers( pCtx->pFuncList, pCtx->pSvcParam, uiCpuFeatureFlags );

	pCtx->iActiveThreadsNum	= pCodingParam->iCountThreadsNum;
	pCtx->iMaxSliceCount	= iSliceNum;
	iRet = RequestMemorySvc( &pCtx );
	if ( iRet != 0 )
	{
		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), RequestMemorySvc failed return %d.\n", iRet);
		FreeMemorySvc( &pCtx );
		return iRet;
	}

#ifdef MT_ENABLED
	// NOTE(review): the result of CreateSliceThreads() is stored but never checked - confirm whether
	// a failure here should abort the initialization.
	if ( pCodingParam->iMultipleThreadIdc > 1 )
		iRet = CreateSliceThreads( pCtx);
#endif

	WelsRcInitModule( pCtx,  pCtx->pSvcParam->bEnableRc ? WELS_RC_GOM : WELS_RC_DISABLE);

	pCtx->pVpp = new CWelsPreProcess((void *)pCtx);
	if ( pCtx->pVpp == NULL )
	{
		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), pOut of memory in case new CWelsPreProcess().\n");
		FreeMemorySvc( &pCtx );
		// iRet is normally 0 at this point, returning it would report success although no context exists
		return 1;
	}

#if defined(MEMORY_MONITOR)
	WelsLog(pCtx, WELS_LOG_INFO, "WelsInitEncoderExt() exit, overall memory usage: %lu bytes\n", sizeof(sWelsEncCtx) /* requested size from malloc() or new operator */
                                                                                                 + pCtx->pMemAlign->WelsGetMemoryUsage()	/* requested size from CMemoryAlign::WelsMalloc() */
             );
#endif//MEMORY_MONITOR

	*ppCtx	= pCtx;

	WelsLog(pCtx, WELS_LOG_DEBUG, "WelsInitEncoderExt(), pCtx= 0x%p.\n", (void *)pCtx);

	return 0;
}
/*
 *
 * status information output
 */
#if defined(STAT_OUTPUT)
/*!
 * \brief	print the accumulated encoding statistics of every dependency layer
 *
 * For each dependency layer the layer index is written to stdout, followed by
 * the average PSNR, bit rate and frame rate when at least one slice was counted.
 * With MB_TYPES_CHECK defined the macroblock type distribution is written to
 * stderr as well.
 *
 * \param	pCtx	encoder context holding sStatData, pSvcParam and pWelsSvcRc
 */
void StatOverallEncodingExt(sWelsEncCtx *pCtx)
{
    int8_t i = 0;
	// quality layer index; never changed below, so only sStatData[i][0] is reported
	int8_t j = 0;
	for (i = 0;i<pCtx->pSvcParam->iNumDependencyLayer;i++)
	{
			fprintf( stdout,"\nDependency layer : %d\n",i);
			fprintf( stdout,"Quality layer : %d\n",j);
			{
				// total number of slices counted over I, P and B slice types
				const int32_t iCount = pCtx->sStatData[i][j].sSliceData.iSliceCount[I_SLICE] +
					                pCtx->sStatData[i][j].sSliceData.iSliceCount[P_SLICE] +
									pCtx->sStatData[i][j].sSliceData.iSliceCount[B_SLICE];
#if defined(MB_TYPES_CHECK) 
				if (iCount > 0){
					// NOTE(review): the literal indices 7..11 into iMbCount are not named here; per the format
					// string below they are printed as IBL (7), BLSKIP (8), RP (9), SUBP8x8 (10) and ILP (11).
					int32_t iCountNumIMb = pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] + pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16]+ pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7];
					int32_t iCountNumPMb	=	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip];	
					// inter coded macroblocks of P slices, used as denominator of the "(PL0)" ratios
					int32_t count_p_mbL0 = 	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10];
					
					int32_t iMbCount = iCountNumIMb + iCountNumPMb;
					if ( iMbCount > 0 ){
						// NOTE(review): count_p_mbL0 is not checked against zero before it is used as a divisor below
						fprintf(	stderr,
							"SVC: overall Slices	MBs: %d Avg\nI4x4: %.3f%% I16x16: %.3f%% IBL: %.3f%%\nP16x16: %.3f%% P16x8: %.3f%% P8x16: %.3f%% P8x8: %.3f%% SUBP8x8: %.3f%% PSKIP: %.3f%%\nILP(All): %.3f%% ILP(PL0): %.3f%% BLSKIP(PL0): %.3f%% RP(PL0): %.3f%%\n",
							iMbCount,
							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4]) / iMbCount),
							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16]) / iMbCount),
							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7]) / iMbCount),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] / iMbCount ),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] / iMbCount ),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] / iMbCount ),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] / iMbCount),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] / iMbCount),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip] / iMbCount),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / iMbCount),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / count_p_mbL0),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][8] / count_p_mbL0),
							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][9] / count_p_mbL0) 
							);
					}					
				}
#endif //#if defined(MB_TYPES_CHECK)

				if (iCount > 0){
					// PSNR values are averaged over the slice count; the kb/s figure divides the summed
					// slice sizes by the slice count plus the skipped frames of this layer
					fprintf( stdout, "SVC: overall PSNR Y: %2.3f U: %2.3f V: %2.3f kb/s: %.1f fps: %.3f\n\n",
						(pCtx->sStatData[i][j].sQualityStat.rYPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rYPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rYPsnr[B_SLICE]) / (float)(iCount),
						(pCtx->sStatData[i][j].sQualityStat.rUPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rUPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rUPsnr[B_SLICE]) / (float)(iCount),
						(pCtx->sStatData[i][j].sQualityStat.rVPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rVPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rVPsnr[B_SLICE]) / (float)(iCount),
						1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate *(pCtx->sStatData[i][j].sSliceData.iSliceSize[I_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[P_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[B_SLICE] ) / (float)(iCount+pCtx->pWelsSvcRc[i].iSkipFrameNum)/1000,
						1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate );

				}

			}
		
	}
}
#endif
/*!
 * \brief	uninitialize Wels encoder core library
 *
 * Stops the slice coding threads (when multi-threading is compiled in and in use),
 * destroys the pre-processing object and releases the context through
 * FreeMemorySvc(). *ppCtx is NULL afterwards.
 *
 * \pParam	ppCtx		sWelsEncCtx**, nothing is done if it or *ppCtx is NULL
 * \return	none
 */
void WelsUninitEncoderExt( sWelsEncCtx **ppCtx )
{
	sWelsEncCtx *pCtx = ( NULL != ppCtx ) ? *ppCtx : NULL;

	if ( NULL == pCtx )
		return;

	WelsLog( pCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pCtx= %p, iThreadCount= %d, iMultipleThreadIdc= %d.\n", (void *)pCtx, pCtx->pSvcParam->iCountThreadsNum, pCtx->pSvcParam->iMultipleThreadIdc );

#if defined(STAT_OUTPUT)
	StatOverallEncodingExt( pCtx );
#endif

#if defined(MT_ENABLED)
	if ( pCtx->pSvcParam->iMultipleThreadIdc > 1 && pCtx->pSliceThreading != NULL )
	{
		const int32_t iThreadCount = pCtx->pSvcParam->iCountThreadsNum;
		int32_t iThreadIdx = 0;

#if defined(WIN32)
		if ( pCtx->pSliceThreading->pExitEncodeEvent != NULL )
		{
			// ask every successfully created thread to exit, then wait for all of them
			do {
				if ( pCtx->pSliceThreading->pThreadHandles[iThreadIdx] != NULL )
					WelsEventSignal( &pCtx->pSliceThreading->pExitEncodeEvent[iThreadIdx] );
				++ iThreadIdx;
			} while(iThreadIdx < iThreadCount);

			WelsMultipleEventsWaitAllBlocking( iThreadCount, &pCtx->pSliceThreading->pFinSliceCodingEvent[0] );

		}
#elif defined(__GNUC__)
		for ( ; iThreadIdx < iThreadCount; ++ iThreadIdx )
		{
			int res = 0;
			// cancel and join each slice coding thread that exists
			if ( pCtx->pSliceThreading->pThreadHandles[iThreadIdx] )
			{
				res = WelsThreadCancel( pCtx->pSliceThreading->pThreadHandles[iThreadIdx] );
				WelsLog( pCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pThreadHandles%d) return %d..\n", iThreadIdx, res);
				res = WelsThreadJoin( pCtx->pSliceThreading->pThreadHandles[iThreadIdx] );	// waiting thread exit
				WelsLog( pCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pThreadHandles%d) return %d..\n", iThreadIdx, res);
				pCtx->pSliceThreading->pThreadHandles[iThreadIdx] = 0;
			}
#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
			// same treatment for the macroblock list update threads
			if ( pCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] )
			{
				res = WelsThreadCancel( pCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );
				WelsLog( pCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);
				res = WelsThreadJoin( pCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );	// waiting thread exit
				WelsLog( pCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);
				pCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] = 0;
			}
#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
		}
#endif//WIN32
	}
#endif//MT_ENABLED

	if ( NULL != pCtx->pVpp )
	{
		delete pCtx->pVpp;
		pCtx->pVpp = NULL;
	}

	// the context itself is released through the caller's handle
	FreeMemorySvc( ppCtx );
	*ppCtx = NULL;
}

/*!
 * \brief	look up the temporal id of a frame in the layer's coding-index table
 *
 * The table index is kiFrameNum masked with (kiGopSize - 1).
 * NOTE(review): the mask only acts as a modulo if kiGopSize is a power of two - confirm with callers.
 *
 * \param	fDlp		dependency layer parameters providing uiCodingIdx2TemporalId
 * \param	kiFrameNum	frame number
 * \param	kiGopSize	gop size
 * \return	the table entry selected by the masked frame number
 */
static inline int32_t GetTemporalLevel( SDLayerParam *fDlp, const int32_t kiFrameNum, const int32_t kiGopSize )
{
	return fDlp->uiCodingIdx2TemporalId[kiFrameNum & (kiGopSize - 1)];
}

/*!
 * \brief	refresh slice id and neighbour availability of every macroblock in the frame
 *
 * A neighbour (left, top, top-left, top-right) counts as available when it lies
 * inside the picture and maps to the same slice as the current macroblock.
 *
 * \param	pSliceCtx	slice context giving picture width/size in macroblocks and the mb-to-slice mapping
 * \param	pMbList		macroblock list of the frame, updated in place
 */
void DynslcUpdateMbNeighbourInfoListForAllSlices( SSliceCtx *pSliceCtx, SMB *pMbList )
{
	const int32_t kiPicWidthInMb	= pSliceCtx->iMbWidth;
	const int32_t kiLastMbIdx		= pSliceCtx->iMbNumInFrame - 1;
	int32_t iMbIdx					= 0;

	// do/while: the first list entry is always processed
	do {
		SMB *pCurMb					= &pMbList[iMbIdx];
		const int32_t kiCurXY		= pCurMb->iMbXY;
		const int32_t kiCol			= pCurMb->iMbX;
		const int32_t kiRow			= pCurMb->iMbY;
		const int32_t kiAboveXY		= kiCurXY - kiPicWidthInMb;
		const int32_t kiCurSliceIdc	= WelsMbToSliceIdc(pSliceCtx, kiCurXY);
		uint32_t uiAvail			= 0;

		pCurMb->uiSliceIdc	= kiCurSliceIdc;

		// the position test comes first, so the slice lookup is only done for neighbours inside the picture
		if ( (kiCol > 0) && (kiCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiCurXY - 1)) )
			uiAvail |= LEFT_MB_POS;
		if ( (kiRow > 0) && (kiCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiAboveXY)) )
			uiAvail |= TOP_MB_POS;
		if ( (kiCol > 0) && (kiRow > 0) && (kiCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiAboveXY - 1)) )
			uiAvail |= TOPLEFT_MB_POS;
		if ( (kiCol < (kiPicWidthInMb-1)) && (kiRow > 0) && (kiCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiAboveXY + 1)) )
			uiAvail |= TOPRIGHT_MB_POS;

		pCurMb->uiNeighborAvail	= (uint8_t)uiAvail;

		++ iMbIdx;
	} while(iMbIdx <= kiLastMbIdx);
}

/*!
 * \brief	decide into how many partitions the current picture is split
 *
 * Without MT_ENABLED, or when iMultipleThreadIdc is not above 1, the answer is 1.
 * Otherwise it is iCountThreadsNum, except that P slices get a single partition
 * unless FIXED_PARTITION_ASSIGN is defined.
 *
 * \param	pCtx	encoder context
 * \return	number of partitions
 */
int32_t PicPartitionNumDecision( sWelsEncCtx *pCtx )
{
#ifdef MT_ENABLED
	if ( pCtx->pSvcParam->iMultipleThreadIdc > 1 )
	{
#if !defined(FIXED_PARTITION_ASSIGN)
		if ( P_SLICE == pCtx->eSliceType )
			return 1;
#endif//!FIXED_PARTITION_ASSIGN
		return pCtx->pSvcParam->iCountThreadsNum;
	}
#endif//MT_ENABLED
	return 1;
}

#if defined(MT_ENABLED)
/*!
 * \brief	refresh the macroblock neighbour information of the current layer (MT build)
 * \param	pCtx	encoder context whose pCurDqLayer is updated
 */
void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )
{
	SDqLayer *pLayer = pCtx->pCurDqLayer;

	// slice ids and neighbour flags follow the layer's current slice context
	DynslcUpdateMbNeighbourInfoListForAllSlices( pLayer->pSliceEncCtx, pLayer->sMbDataP );
}

/*!
 * \brief	split the frame's macroblocks into iPartitionNum consecutive slices
 *
 * Every partition but the last gets iMbNumInFrame / iPartitionNum macroblocks,
 * the last one takes whatever is left. First-mb index, mb count and the
 * overall mb map of the slice context are rewritten accordingly.
 *
 * \param	pSliceCtx		slice context, updated in place
 * \param	iPartitionNum	requested partition count, limited to [1, AVERSLICENUM_CONSTRAINT]
 */
void UpdateSlicepEncCtxWithPartition( SSliceCtx *pSliceCtx, int32_t iPartitionNum )
{
	const int32_t kiTotalMbNum	= pSliceCtx->iMbNumInFrame;
	int32_t iMbNumPerPart		= kiTotalMbNum;
	int32_t iMbNumLeft			= kiTotalMbNum;
	int32_t iStartMbIdx			= 0;
	int32_t iPartIdx;

	if ( iPartitionNum <= 0 )
		iPartitionNum	= 1;
	else if ( iPartitionNum > AVERSLICENUM_CONSTRAINT )
		iPartitionNum	= AVERSLICENUM_CONSTRAINT;	// AVERSLICENUM_CONSTRAINT might be variable, however not fixed by MACRO

	iMbNumPerPart				/= iPartitionNum;
	pSliceCtx->iSliceNumInFrame	= iPartitionNum;

	for ( iPartIdx = 0; iPartIdx < iPartitionNum; ++ iPartIdx )
	{
		// the final partition absorbs the remainder of the division
		pSliceCtx->pCountMbNumInSlice[iPartIdx]	= ( iPartIdx + 1 == iPartitionNum ) ? iMbNumLeft : iMbNumPerPart;
		pSliceCtx->pFirstMbInSlice[iPartIdx]	= iStartMbIdx;

		// tag all macroblocks of this partition with its index
		memset( pSliceCtx->pOverallMbMap+iStartMbIdx, (uint8_t)iPartIdx, pSliceCtx->pCountMbNumInSlice[iPartIdx]*sizeof(uint8_t) );

		// advance to the next partition (or slice)
		iStartMbIdx	+= pSliceCtx->pCountMbNumInSlice[iPartIdx];
		iMbNumLeft	-= pSliceCtx->pCountMbNumInSlice[iPartIdx];
	}
}

/*!
 * \brief	set up the slice partitioning of the current dependency layer (MT build)
 *
 * Partitions the frame into iPartitionNum slices, warns for I slices when the
 * configured slice size constraint is below a rough per-slice byte estimate,
 * and finally refreshes the macroblock neighbour information.
 *
 * \param	pCtx			encoder context
 * \param	iPartitionNum	number of partitions requested for the current picture
 */
void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
{
	SDqLayer* pCurDq				= pCtx->pCurDqLayer;
	SSliceCtx* pSliceCtx		= pCurDq->pSliceEncCtx;

	UpdateSlicepEncCtxWithPartition( pSliceCtx, iPartitionNum );

	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
	{
#define byte_complexIMBat26 (60)
		uint8_t		iCurDid = pCtx->uiDependencyId;
		uint32_t	uiFrmByte = 0;	// rough estimate of the bytes spent on this frame

		if ( pCtx->pSvcParam->bEnableRc )
		{//RC case: bits per frame converted to bytes
			uint32_t uiFrameRate = (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);

			// a frame rate below 1.0 truncates to 0, which would be an integer division by zero
			if ( 0 == uiFrameRate )
				uiFrameRate = 1;

			uiFrmByte = (
				( (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate)
				/uiFrameRate ) >> 3 );
		}
		else
		{//fixed QP case
			const int32_t iTtlMbNumInFrame = pSliceCtx->iMbNumInFrame;
			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );

			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
			if ( iQDeltaTo26 > 0 )
			{
				//smaller QP than 26
				// NOTE(review): for a delta of 1..3 this factor is below 1 and lowers the estimate - confirm intent
				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
			}
			else if ( iQDeltaTo26 < 0 )
			{
				//larger QP than 26
				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta divided by 4 (integer)
				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //halve the estimate per 4 QP steps
			}
		}

		//MINPACKETSIZE_CONSTRAINT
		// compare against the estimated bytes per slice; skipped when the slice number constraint is 0
		if ( ( 0 != pSliceCtx->iMaxSliceNumConstraint )
			&& ( pSliceCtx->uiSliceSizeConstraint
			<
			 (uint32_t)( uiFrmByte
			 / ( pSliceCtx->iMaxSliceNumConstraint ) ) )
			)
		{

			WelsLog( pCtx,
				WELS_LOG_WARNING,
				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
				pSliceCtx->uiSliceSizeConstraint,
				pSliceCtx->iMbNumInFrame
				);
		}
	}

	WelsInitCurrentQBLayerMltslc( pCtx );
}
#else
/*!
 * \brief	reset the current layer to a single slice covering the whole frame (non-MT build)
 *
 * Clears the slice context tables, refreshes the macroblock neighbour
 * information and initializes the first slice of the layer.
 *
 * \param	pCtx	encoder context whose pCurDqLayer is updated
 */
void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )
{
	SDqLayer *pLayer			= pCtx->pCurDqLayer;
	SSliceCtx *pSlcCtx			= pLayer->pSliceEncCtx;
	SSlice *pFirstSlice			= pLayer->sLayerInfo.pSliceInLayer;
	const int32_t kiFrameMbNum	= pSlcCtx->iMbNumInFrame;

	// slice context: the per-slice tables are cleared for the slice count valid so far,
	// only then is the count reset to one
	memset( pSlcCtx->pOverallMbMap,			0, kiFrameMbNum * sizeof(uint8_t) );
	memset( pSlcCtx->pCountMbNumInSlice,	0, pSlcCtx->iSliceNumInFrame * sizeof(int32_t) );
	memset( pSlcCtx->pFirstMbInSlice,		0, pSlcCtx->iSliceNumInFrame * sizeof(int16_t) );
	pSlcCtx->iSliceNumInFrame		= 1;
	pSlcCtx->pCountMbNumInSlice[0]	= kiFrameMbNum;

	// macroblock neighbour information
	DynslcUpdateMbNeighbourInfoListForAllSlices( pSlcCtx, pLayer->sMbDataP );

	// first (and only) slice
	pFirstSlice->uiSliceIdx							= 0;
	pFirstSlice->pSliceBsa							= &pCtx->pOut->sBsWrite;
	pFirstSlice->bDynamicSlicingSliceSizeCtrlFlag	= false;
	if ( P_SLICE == pCtx->eSliceType )
		pFirstSlice->uiAssumeLog2BytePerMb	= 0;
	else
		pFirstSlice->uiAssumeLog2BytePerMb	= 1;
}

/*!
 * \brief	set up the slice context of the current dependency layer (non-MT build)
 *
 * Records the frame's macroblock count in the slice context, warns for I slices
 * when the configured slice size constraint is below a rough per-slice byte
 * estimate, and finally resets the layer to single-slice coding.
 *
 * \param	pCtx			encoder context
 * \param	iPartitionNum	not used in this build variant
 */
void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
{
	SDqLayer* pCurDq = pCtx->pCurDqLayer;
	SSliceCtx* pSliceCtx = ( pCurDq->pSliceEncCtx );
	int32_t iTtlMbNumInFrame = pCurDq->iMbHeight*pCurDq->iMbWidth;

	pSliceCtx->iMbNumInFrame
		= pSliceCtx->pCountMbNumInSlice[0] = iTtlMbNumInFrame;

	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
	{
#define byte_complexIMBat26 (60)
		uint8_t		iCurDid = pCtx->uiDependencyId;
		uint32_t	uiFrmByte = 0;	// rough estimate of the bytes spent on this frame

		if ( pCtx->pSvcParam->bEnableRc )
		{//RC case: bits per frame converted to bytes
			uint32_t uiFrameRate = (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);

			// a frame rate below 1.0 truncates to 0, which would be an integer division by zero
			if ( 0 == uiFrameRate )
				uiFrameRate = 1;

			uiFrmByte = (
				( (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate)
				/uiFrameRate ) >> 3 );
		}
		else
		{//fixed QP case
			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );

			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
			if ( iQDeltaTo26 > 0 )
			{
				//smaller QP than 26
				// NOTE(review): for a delta of 1..3 this factor is below 1 and lowers the estimate - confirm intent
				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
			}
			else if ( iQDeltaTo26 < 0 )
			{
				//larger QP than 26
				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta divided by 4 (integer)
				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //halve the estimate per 4 QP steps
			}
		}

		//MINPACKETSIZE_CONSTRAINT
		// compare against the estimated bytes per slice; skipped when the slice number constraint is 0
		if ( ( 0 != pSliceCtx->iMaxSliceNumConstraint )
			&& ( pSliceCtx->uiSliceSizeConstraint
			<
			 (uint32_t)( uiFrmByte
			 / ( pSliceCtx->iMaxSliceNumConstraint ) ) )
			)
		{

			WelsLog( pCtx,
				WELS_LOG_WARNING,
				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
				pSliceCtx->uiSliceSizeConstraint,
				pSliceCtx->iMbNumInFrame
				);
		}
	}

	WelsInitCurrentQBLayerMltslc( pCtx );
}
#endif

/*!
 * \brief	initialize current layer
 *
 * Binds the decoded picture, the PPS/SPS (or subset SPS) references and the
 * NAL unit header of the current dependency layer, propagates the parameter
 * set references and the header extension flag to every slice of the layer,
 * and hooks up the source and reconstruction picture planes.
 * Nothing is done when the context has no current layer.
 *
 * \param	pCtx		encoder context
 * \param	kiWidth		not used by this function
 * \param	kiHeight	not used by this function
 */
void WelsInitCurrentLayer(	sWelsEncCtx *pCtx,
								const int32_t kiWidth,
								const int32_t kiHeight )
{
	SDqLayer *pCurDq				= pCtx->pCurDqLayer;

	// must be tested before anything below reads through pCurDq
	if ( NULL == pCurDq )
		return;

	SWelsSvcCodingParam *pParam	= pCtx->pSvcParam;
	SPicture *pEncPic					= pCtx->pEncPic;
	SPicture *pDecPic					= pCtx->pDecPic;
	SSlice *pBaseSlice				= &pCurDq->sLayerInfo.pSliceInLayer[0];
	SSlice *pSlice					= NULL;
	const uint8_t kiCurDid			= pCtx->uiDependencyId;
	const bool_t kbUseSubsetSpsFlag= (kiCurDid > BASE_DEPENDENCY_ID);
	SDLayerParam *fDlp				= &pParam->sDependencyLayers[kiCurDid];
	SNalUnitHeaderExt *pNalHdExt	= &pCurDq->sLayerInfo.sNalHeaderExt;
	SNalUnitHeader *pNalHd			= &pNalHdExt->sNalHeader;
	SDqIdc *pDqIdc						= &pCtx->pDqIdcMap[kiCurDid];
	int32_t iIdx						= 0;
	int32_t iSliceCount				= 0;

	pCurDq->pDecPic	= pDecPic;

	if ( fDlp->sMso.uiSliceMode == SM_DYN_SLICE )	// need get extra slices for update
		iSliceCount = GetInitialSliceNum( pCurDq->iMbWidth, pCurDq->iMbHeight, &fDlp->sMso );
	else
		iSliceCount = GetCurrentSliceNum( pCurDq->pSliceEncCtx );
	assert( iSliceCount > 0 );

	// parameter set references of the first slice
	pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pDqIdc->iPpsId;
	pCurDq->sLayerInfo.pPpsP							=
	pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps		= &pCtx->pPPSArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId];
	pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pDqIdc->iSpsId;
	if ( kbUseSubsetSpsFlag )
	{
		// layers above the base dependency id take their SPS from the subset SPS array
		pCurDq->sLayerInfo.pSubsetSpsP					= &pCtx->pSubsetArray[pDqIdc->iSpsId];
		pCurDq->sLayerInfo.pSpsP						=
		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCurDq->sLayerInfo.pSubsetSpsP->pSps;
	}
	else
	{
		pCurDq->sLayerInfo.pSubsetSpsP					= NULL;
		pCurDq->sLayerInfo.pSpsP						=
		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCtx->pSpsArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId];
	}

	// remaining slices share the parameter sets of the first one
	pSlice = pBaseSlice;
	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx ) {
		++ pSlice;
		pSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId;
		pSlice->sSliceHeaderExt.sSliceHeader.pPps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps;
		pSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId;
		pSlice->sSliceHeaderExt.sSliceHeader.pSps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps;
	}

	// NAL unit header (extension) of the layer
	memset( pNalHdExt, 0, sizeof(SNalUnitHeaderExt) );
	pNalHd->uiNalRefIdc					= pCtx->eNalPriority;
	pNalHd->eNalUnitType				= pCtx->eNalType;

	pNalHdExt->uiDependencyId			= kiCurDid;
	pNalHdExt->bDiscardableFlag		= (pCtx->bNeedPrefixNalFlag) ? (pNalHd->uiNalRefIdc == NRI_PRI_LOWEST) : false;
	pNalHdExt->bIdrFlag				= (pCtx->iFrameNum == 0) && ((pCtx->eNalType == NAL_UNIT_CODED_SLICE_IDR) || (pCtx->eSliceType == I_SLICE));
	pNalHdExt->uiTemporalId				= pCtx->uiTemporalId;

	pBaseSlice->bSliceHeaderExtFlag	= (NAL_UNIT_CODED_SLICE_EXT == pNalHd->eNalUnitType);

	// remaining slices use the same header extension flag
	pSlice = pBaseSlice;
	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx ) {
		++ pSlice;
		pSlice->bSliceHeaderExtFlag			= pBaseSlice->bSliceHeaderExtFlag;
	}

	// pEncPic pData
	pCurDq->pEncData[0]		= pEncPic->pData[0];
	pCurDq->pEncData[1]		= pEncPic->pData[1];
	pCurDq->pEncData[2]		= pEncPic->pData[2];
	pCurDq->iEncStride[0]	= pEncPic->iLineSize[0];
	pCurDq->iEncStride[1]	= pEncPic->iLineSize[1];
	pCurDq->iEncStride[2]	= pEncPic->iLineSize[2];
	// cs pData
	pCurDq->pCsData[0]		= pDecPic->pData[0];
	pCurDq->pCsData[1]		= pDecPic->pData[1];
	pCurDq->pCsData[2]		= pDecPic->pData[2];
	pCurDq->iCsStride[0]	= pDecPic->iLineSize[0];
	pCurDq->iCsStride[1]	= pDecPic->iLineSize[1];
	pCurDq->iCsStride[2]	= pDecPic->iLineSize[2];

	// a base layer is available exactly when a reference layer is attached
	pCurDq->bBaseLayerAvailableFlag	= ( pCurDq->pRefLayer != NULL );
}

/*!
 * \brief	select the mode decision / motion estimation function pointers in pCtx->pFuncList
 *			for the slice type and dependency layer that are about to be coded
 *
 * The highest dependency layer (uiDependencyId + 1 == iNumDependencyLayer) gets the SAD based
 * cost functions and the "Vaa" fine partition deciders; every other layer gets the SATD based
 * ones (including the 4x4 combined intra function, which is only assigned in that case).
 * Inter related pointers are only assigned for P slices. Slice types other than P and I
 * leave the function list untouched. The base-layer-available flag of the layer does not
 * influence the selection.
 *
 * \param	pCtx	encoder context; pCurDqLayer, pSvcParam, eSliceType and pFuncList are read
 */
void PreprocessSliceCoding( sWelsEncCtx *pCtx )
{
	SDqLayer *pLayer		= pCtx->pCurDqLayer;
	const bool_t kbPSlice	= ( P_SLICE == pCtx->eSliceType );

	if ( !kbPSlice && ( I_SLICE != pCtx->eSliceType ) )
		return;

	const bool_t kbTopLayer	= ( pLayer->sLayerInfo.sNalHeaderExt.uiDependencyId+1 == pCtx->pSvcParam->iNumDependencyLayer );

	if ( kbTopLayer )
	{
		// highest dependency layer: SAD based costs
		if ( kbPSlice )
		{
			pCtx->pFuncList->pfMotionSearch	= WelsMotionEstimateSearchSad;
			pCtx->pFuncList->pfInterFineMd	= WelsMdInterFinePartitionVaa;
		}
		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartitionVaa;
		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSad;
		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad;
		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad;
	}
	else
	{
		// lower dependency layers: SATD based costs
		if ( kbPSlice )
		{
			pCtx->pFuncList->pfMotionSearch	= WelsMotionEstimateSearchSatd;
			pCtx->pFuncList->pfInterFineMd	= WelsMdInterFinePartition;
		}
		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartition;
		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd;
		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd;
		pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd;
	}

	if ( kbPSlice )
	{
		// layer independent choices for inter coding
		pCtx->pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
		pCtx->pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
	}
}

/*!
 * \brief	advance the current pDq layer to the next dependency layer
 *
 * The layer stored at index (uiDependencyId + 1) of ppDqLayerList becomes the current layer,
 * and the layer that was current so far is linked to it as its reference layer.
 *
 * \param	pCtx	encoder context
 */

static inline void WelsSwapDqLayers( sWelsEncCtx *pCtx )
{
	const int32_t kiCurDid		= pCtx->uiDependencyId;
	const int32_t kiUpperIdx	= kiCurDid + 1;
	SDqLayer *pLowerLayer		= pCtx->pCurDqLayer;
	SDqLayer *pUpperLayer		= pCtx->ppDqLayerList[kiUpperIdx];

	pUpperLayer->pRefLayer	= pLowerLayer;	// previous current layer serves as reference
	pCtx->pCurDqLayer		= pUpperLayer;
}

/*!
 * \brief	prefetch reference picture after WelsBuildRefList
 *
 * For non-IDR frames the first entry of pRefList0 is installed as reference picture of the
 * context and of the current pDq layer, and reference index 0 is written to every slice header.
 * For IDR frames both reference pointers are cleared and (uint8_t)-1 is written instead.
 *
 * \param	pCtx		encoder context
 * \param	keFrameType	type of the frame being coded
 */
static inline void PrefetchReferencePicture( sWelsEncCtx *pCtx, const EFrameType keFrameType )
{
	SDqLayer *pCurDq			= pCtx->pCurDqLayer;
	SSlice *pSliceList			= &pCurDq->sLayerInfo.pSliceInLayer[0];
	const int32_t kiSliceNum	= GetCurrentSliceNum( pCurDq->pSliceEncCtx );
	const bool_t kbIdrFrame		= ( keFrameType == WELS_FRAME_TYPE_IDR );
	const uint8_t kuiRefIdx		= kbIdrFrame ? (uint8_t)-1 : 0;	// 0: reordered reference index

	assert( kiSliceNum > 0 );
	if ( kbIdrFrame )	// safe for IDR coding
	{
		pCtx->pRefPic	= NULL;
		pCurDq->pRefPic	= NULL;
	}
	else
	{
		assert( pCtx->iNumRef0 > 0 );
		pCtx->pRefPic	= pCtx->pRefList0[0];	// always item 0 since reordering is done
		pCurDq->pRefPic	= pCtx->pRefPic;
	}

	for ( int32_t iSlice = 0; iSlice < kiSliceNum; ++ iSlice )
	{
		pSliceList[iSlice].sSliceHeaderExt.sSliceHeader.uiRefIndex	= kuiRefIdx;
	}
}


/*!
 * \brief	assign the next free bitstream id to one parameter set and pick the id to hand out next
 *
 * The id written to the bitstream for a parameter set is (encoder id + iParaSetIdDelta[encoder id]).
 * On each call:
 *   1. the delta of kiCurEncoderParaSetId is changed so that it maps to uiNextParaSetIdToUseInBs;
 *   2. the id it mapped to before is marked unused, the new one is marked used;
 *   3. uiNextParaSetIdToUseInBs advances to the following unused id, wrapping to 0 when it
 *      reaches kuiMaxIdInBs.
 * NOTE(review): step 3 keeps searching until an unused id is found, so it relies on at least
 * one id below kuiMaxIdInBs being free - confirm callers guarantee that.
 *
 * \param	sParaSetOffsetVariable	id bookkeeping of one parameter set type
 * \param	kiCurEncoderParaSetId	id of the parameter set inside the encoder
 * \param	kuiMaxIdInBs			number of ids available in the bitstream (ids wrap below this value)
 */
void ParasetIdAdditionIdAdjust( SParaSetOffsetVariable *sParaSetOffsetVariable, const int32_t kiCurEncoderParaSetId, const uint32_t kuiMaxIdInBs )
{
	// id the parameter set was mapped to until now; must be taken before the delta changes
	const uint32_t kuiOldIdInBs	= sParaSetOffsetVariable->iParaSetIdDelta[kiCurEncoderParaSetId] + kiCurEncoderParaSetId;
	const uint32_t kuiNewIdInBs	= sParaSetOffsetVariable->uiNextParaSetIdToUseInBs;
	const bool_t *kpUsedFlags	= &sParaSetOffsetVariable->bUsedParaSetIdInBs[0];
	uint32_t uiCandidate		= kuiNewIdInBs;

#if _DEBUG
	if ( 0 != sParaSetOffsetVariable->iParaSetIdDelta[kiCurEncoderParaSetId] )
		assert ( sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiOldIdInBs] ); //sure the prev-used one was marked activated correctly
#endif

	// remap the current parameter set onto the new bitstream id
	sParaSetOffsetVariable->iParaSetIdDelta[kiCurEncoderParaSetId]	= kuiNewIdInBs - kiCurEncoderParaSetId;

	// release the old id first, then occupy the new one (they may be identical)
	sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiOldIdInBs]	= false;
	sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiNewIdInBs]	= true;

	// search the next available id, wrapping around so that it never reaches kuiMaxIdInBs
	for ( ;; )
	{
		++ uiCandidate;
		if ( uiCandidate >= kuiMaxIdInBs )
			uiCandidate = 0;
		if ( !kpUsedFlags[uiCandidate] )
			break;
	}

	sParaSetOffsetVariable->uiNextParaSetIdToUseInBs = uiCandidate;

#if _DEBUG
	assert ( !sParaSetOffsetVariable->bUsedParaSetIdInBs[uiCandidate] ); //sure the next-to-use one is still unused
#endif

}

/*!
 * \brief	write all parameter sets introduced in SVC extension
 *
 * Emits one NAL per SPS / subset SPS (iSpsNum of them) followed by one NAL per PPS (iPpsNum of
 * them) into pCtx->pFrameBs at pCtx->iPosBsBuffer, advancing iPosBsBuffer accordingly.
 * Dependency layers above BASE_DEPENDENCY_ID are written as subset SPS, the base one as SPS.
 * When bEnableSpsPpsIdAddition is set, every parameter set is given a fresh id in the
 * bitstream through ParasetIdAdditionIdAdjust() before it is written.
 *
 * \param	pCtx	encoder context
 * \param	pNalLen	output, length of each written NAL in writing order; the caller must provide
 *					room for at least (iSpsNum + iPpsNum) entries
 * \param	pNumNal	output, number of NALs written (left untouched if an argument is NULL)
 * \return	size in bytes of bitstream wrote, 0 if any argument is NULL
 */
int32_t WelsWriteParameterSets( sWelsEncCtx *pCtx, int32_t *pNalLen, int32_t *pNumNal )
{
	int32_t iSize		= 0;
	int32_t iIdx		= 0;
	int32_t iCountNal	= 0;

	if ( NULL == pCtx || NULL == pNalLen || NULL == pNumNal )
		return 0;

	/* write all SPS */
	for ( iIdx = 0; iIdx < pCtx->iSpsNum; ++ iIdx )
	{
		SDqIdc *pDqIdc		= &pCtx->pDqIdcMap[iIdx];
		const int32_t kiDid	= pDqIdc->uiSpatialId;
		const bool_t kbUsingSubsetSps = (kiDid > BASE_DEPENDENCY_ID);
		const int32_t kiNal	= pCtx->pOut->iNalIndex;	// slot of the NAL loaded below

		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
		{
#if _DEBUG
			pCtx->sPSOVector.bEnableSpsPpsIdAddition = 1;
			assert(kiDid < MAX_DEPENDENCY_LAYER);
			assert(iIdx < MAX_DQ_LAYER_NUM);
#endif

			ParasetIdAdditionIdAdjust( &(pCtx->sPSOVector.sParaSetOffsetVariable[kbUsingSubsetSps ? PARA_SET_TYPE_SUBSETSPS : PARA_SET_TYPE_AVCSPS]), 
				(kbUsingSubsetSps)?(pCtx->pSubsetArray[iIdx - 1].pSps.uiSpsId):(pCtx->pSpsArray[0].uiSpsId ), 
				MAX_SPS_COUNT );
		}
		else
		{
			// Reset the id offsets, but keep the IDR picture id: the encode loop increments it
			// right before calling this function, and clearing it here would give every IDR
			// picture the same idr_pic_id.
			const uint32_t kuiIdrPicId	= pCtx->sPSOVector.uiIdrPicId;
			memset(&(pCtx->sPSOVector), 0, sizeof(pCtx->sPSOVector)  );
			pCtx->sPSOVector.uiIdrPicId	= kuiIdrPicId;
		}

		if ( kbUsingSubsetSps )
		{
			/* generate Subset SPS; entry (iIdx - 1) as the subset array excludes the base layer */
			WelsLoadNal( pCtx->pOut, NAL_UNIT_SUBSET_SPS, NRI_PRI_HIGHEST );
			WelsWriteSubsetSpsSyntax( &pCtx->pSubsetArray[iIdx - 1], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_SUBSETSPS].iParaSetIdDelta[0]) );
			WelsUnloadNal( pCtx->pOut );
		}
		else
		{
			/* generate sequence parameters set */
			WelsLoadNal( pCtx->pOut, NAL_UNIT_SPS, NRI_PRI_HIGHEST );
			WelsWriteSpsNal( &pCtx->pSpsArray[0], &pCtx->pOut->sBsWrite,  &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_AVCSPS].iParaSetIdDelta[0]) );
			WelsUnloadNal( pCtx->pOut );
		}

		pNalLen[iCountNal] = WelsEncodeNal( &pCtx->pOut->sNalList[kiNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iCountNal] );

		pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
		iSize				+= pNalLen[iCountNal];
		++ iCountNal;
	}

	/* write all PPS */
	for ( iIdx = 0; iIdx < pCtx->iPpsNum; ++ iIdx )
	{
		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
		{
			//para_set_type = 2: PPS, use MAX_PPS_COUNT
			ParasetIdAdditionIdAdjust( &pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS], pCtx->pPPSArray[iIdx].iPpsId, MAX_PPS_COUNT );
		}

		const int32_t kiNal	= pCtx->pOut->iNalIndex;	// slot of the NAL loaded below

		/* generate picture parameter set */
		WelsLoadNal( pCtx->pOut, NAL_UNIT_PPS, NRI_PRI_HIGHEST );
		WelsWritePpsSyntax( &pCtx->pPPSArray[iIdx], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector) );
		WelsUnloadNal( pCtx->pOut );

		pNalLen[iCountNal] = WelsEncodeNal( &pCtx->pOut->sNalList[kiNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iCountNal] );

		pCtx->iPosBsBuffer	+= pNalLen[iCountNal];
		iSize				+= pNalLen[iCountNal];
		++ iCountNal;
	}

	*pNumNal = iCountNal;

	return iSize;
}

/*!
 * \brief	append a prefix NAL unit (NAL_UNIT_PREFIX) for the current layer to the frame bitstream
 *
 * The prefix NAL is always emitted with the NAL unit header extension of the current pDq
 * layer. Its RBSP syntax is only written when keNalRefIdc differs from NRI_PRI_LOWEST;
 * otherwise the NAL carries no RBSP syntax.
 *
 * \param	pCtx			encoder context; iPosBsBuffer is advanced by the size written
 * \param	pLayerBsInfo	layer info receiving the NAL length at index *pNalIdxInLayer
 * \param	pNalLen			NAL length array, entry *pNalIdxInLayer is handed to WelsEncodeNalExt()
 * \param	pNalIdxInLayer	in/out, index of the NAL inside the layer; incremented by one
 * \param	keNalType		type of the slice NAL that follows (IDR or not)
 * \param	keNalRefIdc		nal_ref_idc used for the prefix NAL
 * \return	size of the encoded prefix NAL as returned by WelsEncodeNalExt()
 */
static inline int32_t AddPrefixNal(	sWelsEncCtx *pCtx,
									 SLayerBSInfo *pLayerBsInfo,
									 int32_t *pNalLen,
									 int32_t *pNalIdxInLayer,
									 const EWelsNalUnitType keNalType,
									 const EWelsNalRefIdc keNalRefIdc	)
{
	int32_t iPrefixSize = 0;

	WelsLoadNal( pCtx->pOut, NAL_UNIT_PREFIX, keNalRefIdc );
	if ( NRI_PRI_LOWEST != keNalRefIdc )
	{
		// prefix NAL unit RBSP syntax is only present for referenced pictures
		WelsWriteSVCPrefixNal( &pCtx->pOut->sBsWrite, keNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == keNalType) );
	}
	WelsUnloadNal( pCtx->pOut );

	// the NAL just unloaded is the last one in the list
	iPrefixSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
		&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
		pCtx->pFrameBs + pCtx->iPosBsBuffer,
		&pNalLen[*pNalIdxInLayer]	);

	pCtx->iPosBsBuffer								+= iPrefixSize;
	pLayerBsInfo->iNalLengthInByte[*pNalIdxInLayer]	= iPrefixSize;
	++ (*pNalIdxInLayer);

	return iPrefixSize;
}

/*!
 * \brief	write a filler data NAL (NAL_UNIT_FILLER_DATA) carrying iLen bytes of 0xff
 *
 * Nothing is written when fewer than iLen bytes are left in the bit string buffer or when
 * no NAL slot is left in pCtx->pOut.
 * NOTE(review): the space check only accounts for the iLen payload bytes, while RBSP
 * trailing bits are appended afterwards - confirm the buffer always keeps that headroom.
 *
 * \param	pCtx	encoder context; iPosBsBuffer is advanced by the size written
 * \param	iLen	number of 0xff padding bytes to write
 * \return	size of the encoded NAL as returned by WelsEncodeNal(), 0 if nothing was written
 */
int32_t WritePadding(sWelsEncCtx *pCtx, int32_t iLen)
{
	const int32_t kiNal	= pCtx->pOut->iNalIndex;	// slot of the NAL loaded below
	SBitStringAux *pBs	= &pCtx->pOut->sBsWrite;
	int32_t iNalLen		= 0;
	int32_t i			= 0;

	if((pBs->pBufEnd - pBs->pBufPtr) < iLen || kiNal >= pCtx->pOut->iCountNals)
	{
#if GOM_TRACE_FLAG
		// the pointer difference is ptrdiff_t, convert it explicitly to match the %d conversion
		WelsLog( pCtx, WELS_LOG_ERROR,"[RC] paddingcal pBuffer overflow, bufferlen=%d, paddinglen=%d, iNalIdx= %d, iCountNals= %d\n",
			(int32_t)(pBs->pBufEnd-pBs->pBufPtr), iLen, kiNal, pCtx->pOut->iCountNals);
#endif
		return 0;
	}

	WelsLoadNal( pCtx->pOut, NAL_UNIT_FILLER_DATA, NRI_PRI_LOWEST );

	for(i=0;i<iLen;i++)
	{
		BsWriteBits( pBs, 8, 0xff);
	}

	BsRbspTrailingBits( pBs );

	BsFlush( pBs );

	WelsUnloadNal( pCtx->pOut );
	iNalLen = WelsEncodeNal( &pCtx->pOut->sNalList[kiNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &iNalLen );

	pCtx->iPosBsBuffer	+= iNalLen;

	return iNalLen;
}

/*
 * post process of dynamic slicing bs writing in case PACKING_ONE_SLICE_PER_LAYER
 *
 * Sums up the bitstream size and the number of coded slices over all partitions, then
 * compacts the SLayerBSInfo entries so that the entries of the coded slices are contiguous
 * at the front of pLayerBsInfo. Slice entry k belongs to partition (k % kiPartitionCnt) and
 * is valid if (k / kiPartitionCnt) is below the number of slices coded in that partition.
 *
 * pCtx:			encoder context
 * pLayerBsInfo:	layer info entries of the slices, compacted in place
 * pLayerSize:		output, bs size of over all the slices in layer
 * kiPartitionCnt:	number of partitions
 * return: count number of slices in layer
 */
#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
int32_t PostProcDynamicSlicingBsWriting( sWelsEncCtx *pCtx, SLayerBSInfo *pLayerBsInfo, int32_t *pLayerSize, const int32_t kiPartitionCnt )
{
	SDqLayer *pCurDq		= pCtx->pCurDqLayer;
	int32_t iPartitionIdx	= 0;
	int32_t iCheckingIdx	= 0;
	int32_t iSwappingIdx	= -1;	// first free destination entry, < 0 while no gap was met yet
	int32_t iSliceCount		= 0;
	int32_t iLayerSize		= 0;

	// count number of slices in layer and layer size
	for ( iPartitionIdx = 0; iPartitionIdx < kiPartitionCnt; ++ iPartitionIdx )
	{
		iLayerSize	+= pCtx->pSliceThreading->pCountBsSizeInPartition[iPartitionIdx];
		iSliceCount	+= pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
	}
	*pLayerSize	= iLayerSize;

	// reordering pLayerBs pointers, but do not ensure raster scan order of picture
	// just maintain discontinuous items,i.e,
	// input:
	// partition 1: uiSliceIdx: 0 2 4 6
	// partition 2: uiSliceIdx: 1 3 5 7 9 11 13
	// output:
	// uiSliceIdx: 0 1 2 3 4 5 6 7 8 9 10
	iCheckingIdx = 0;
	while(true)
	{
		bool_t bMatchFlag = false;

		for ( iPartitionIdx = 0; iPartitionIdx < kiPartitionCnt; ++ iPartitionIdx )
		{
			const int32_t kiCodedSliceCnt = pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
			// iCheckingIdx need convert to iIndex of iPartitionIdx based to avoid linear searching
			// belong this partition and not exceed the number of slices coded in partition
			if ( iPartitionIdx == (iCheckingIdx % kiPartitionCnt)
				&& iCheckingIdx / kiPartitionCnt < kiCodedSliceCnt )
			{
				if ( iSwappingIdx >= 0 )
				{
					// a gap was met before: move this valid entry down into the first free one
					memmove(pLayerBsInfo+iSwappingIdx, pLayerBsInfo+iCheckingIdx, sizeof(SLayerBSInfo));	// confirmed_safe_unsafe_usage
					++ iSwappingIdx;	// record iSwappingIdx
				}
				++ iCheckingIdx;
				bMatchFlag = true;
				break;
			}
		}
		if ( !bMatchFlag )
		{
			// entry iCheckingIdx holds no coded slice; remember the first such gap
			if ( iSwappingIdx < 0 )
				iSwappingIdx = iCheckingIdx;
			++ iCheckingIdx;
		}
		if ( iSwappingIdx >= iSliceCount )	// all coded slices are contiguous now
			break;
	}

	return iSliceCount;
}
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER

/*
 * Force the encoder to code the current frame as IDR: raises bEncCurFrmAsIdrFlag and
 * restarts iCodingIndex from 0.
 * return: 0 on success, 1 if pCtx is NULL
 */
int32_t ForceCodingIDR( sWelsEncCtx *pCtx )
{
	if ( pCtx != NULL )
	{
		pCtx->iCodingIndex			= 0;
		pCtx->bEncCurFrmAsIdrFlag	= true;
		return 0;
	}

	return 1;
}

/*!
 * \brief	core svc encoding process
 *
 * \pParam	pCtx			sWelsEncCtx*, encoder context
 * \pParam	pDst			FrameBSInfo*
 * \pParam	pSrc			SSourcePicture* for need_ds = true or SSourcePicture** for need_ds = false
 * \pParam	iConfiguredLayerNum	=1 in case need_ds = true or >1 in case need_ds = false
 * \pParam	need_ds		Indicate whether need down sampling desired
 *						[NO in picture list case, YES in console application based]
 * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P)
 */
int32_t WelsEncoderEncodeExt( sWelsEncCtx *pCtx, void *pDst, const SSourcePicture **ppSrcList, const int32_t iConfiguredLayerNum )
{
	SFrameBSInfo *pFbi					= (SFrameBSInfo *)pDst;
	SLayerBSInfo *pLayerBsInfo					= &pFbi->sLayerInfo[0];
	SWelsSvcCodingParam *pSvcParam	= pCtx->pSvcParam;
	SSpatialPicIndex *pSpatialIndexMap= &pCtx->sSpatialIndexMap[0];
#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
	SPicture *fsnr						= NULL;
#endif//ENABLE_FRAME_DUMP || ENABLE_PSNR_CALC
	SPicture *pEncPic						= NULL;	// to be decided later
#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
	int32_t did_list[MAX_DEPENDENCY_LAYER]	= {0};	
#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
	int32_t iLayerNum					= 0;
	int32_t iLayerSize					= 0;
	int32_t iSpatialNum					= 0; // available count number of spatial layers due to frame size changed in this given frame
	int32_t iSpatialIdx					= 0; // iIndex of spatial layers due to frame size changed in this given frame
	int32_t iFrameSize					= 0;
	int32_t iNalLen[128]				= {0};
	int32_t iNalIdxInLayer			= 0;
	int32_t iCountNal					= 0;
	EFrameType eFrameType				= WELS_FRAME_TYPE_AUTO;	
	int32_t iCurWidth					= 0;
	int32_t iCurHeight					= 0;
	EWelsNalUnitType eNalType			= NAL_UNIT_UNSPEC_0;
	EWelsNalRefIdc eNalRefIdc			= NRI_PRI_LOWEST;
	int8_t iCurDid						= 0;
	int8_t iCurTid						= 0;
	bool_t bAvcBased					= false;
#if defined(ENABLE_PSNR_CALC)
	real32_t snr_y = .0f, snr_u = .0f, snr_v = .0f;
#endif//ENABLE_PSNR_CALC

#if defined(_DEBUG)
	int32_t i = 0, j = 0, k = 0;
#endif//_DEBUG

	pFbi->iLayerNum	= 0;	// for initialization

	// perform csc/denoise/downsample/padding, generate spatial layers
	iSpatialNum = pCtx->pVpp->WelsPreprocessStep1(pCtx, ppSrcList, iConfiguredLayerNum);	
	if ( iSpatialNum < 1 )	// skip due to temporal layer settings (different frame rate)
	{
		++ pCtx->iCodingIndex;
		return WELS_FRAME_TYPE_SKIP;
	}

	eFrameType = DecideFrameType( pCtx, iSpatialNum );
	if (eFrameType == WELS_FRAME_TYPE_SKIP)
		return eFrameType;

	InitFrameCoding( pCtx, eFrameType );

	iCurTid	= GetTemporalLevel( &pSvcParam->sDependencyLayers[pSpatialIndexMap->iDid], pCtx->iCodingIndex, pSvcParam->uiGopSize );
	pCtx->uiTemporalId	= iCurTid;
	
	pLayerBsInfo->pBsBuf	= pCtx->pFrameBs ;

	if ( eFrameType == WELS_FRAME_TYPE_IDR  )
	{
		++ pCtx->sPSOVector.uiIdrPicId;
		//if ( pSvcParam->bEnableSSEI )
		
		// write parameter sets bitstream here
		WelsWriteParameterSets( pCtx, &iNalLen[0], &iCountNal );

		pLayerBsInfo->uiPriorityId	= 0;
		pLayerBsInfo->uiSpatialId		= 0;
		pLayerBsInfo->uiTemporalId	= 0;
		pLayerBsInfo->uiQualityId		= 0;
		pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
		pLayerBsInfo->iNalCount		= iCountNal;
		for (int32_t iNalIndex	= 0; iNalIndex < iCountNal; ++ iNalIndex)
		{
			pLayerBsInfo->iNalLengthInByte[iNalIndex]	= iNalLen[iNalIndex];
		}

		++ pLayerBsInfo;
		pLayerBsInfo->pBsBuf			= pCtx->pFrameBs + pCtx->iPosBsBuffer;
		++ iLayerNum;
	}

	pCtx->pCurDqLayer				= pCtx->ppDqLayerList[pSpatialIndexMap->iDid];
	pCtx->pCurDqLayer->pRefLayer	= NULL;

	while ( iSpatialIdx < iSpatialNum )
	{		
		const int32_t d_idx			= (pSpatialIndexMap+iSpatialIdx)->iDid;	// get iDid
		SDLayerParam *param_d		= &pSvcParam->sDependencyLayers[d_idx];			

		pCtx->uiDependencyId	= iCurDid = (int8_t)d_idx;
		pCtx->pVpp->WelsPreprocessStep3(pCtx, d_idx);

		pCtx->pEncPic	 = pEncPic = (pSpatialIndexMap+iSpatialIdx)->pSrc;
		pCtx->pEncPic->iPictureType	= pCtx->eSliceType;
		pCtx->pEncPic->iFramePoc		= pCtx->iPOC;

		iCurWidth	= param_d->iFrameWidth;
		iCurHeight	= param_d->iFrameHeight;

#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
		did_list[iSpatialIdx]	= iCurDid;
#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
		
		// Encoding this picture might mulitiple sQualityStat layers potentially be encoded as followed

		switch ( param_d->sMso.uiSliceMode )
		{
		case SM_FIXEDSLCNUM_SLICE:
			{
#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)	
				if ( (iCurDid > 0) && (pSvcParam->iMultipleThreadIdc > 1) &&
					(pSvcParam->sDependencyLayers[iCurDid].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[iCurDid].sMso.sSliceArgument.iSliceNum )
					) 
					AdjustEnhanceLayer( pCtx, iCurDid );
#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
				break;
			}
		case SM_DYN_SLICE:
			{
				int32_t iPicIPartitionNum = PicPartitionNumDecision( pCtx );
				// MT compatibility
				pCtx->iActiveThreadsNum	= iPicIPartitionNum;	// we try to active number of threads, equal to number of picture partitions
				WelsInitCurrentDlayerMltslc( pCtx, iPicIPartitionNum );
				break;
			}
		default:
			{
				break;
			}
		}

		/* coding each spatial layer, only one sQualityStat layer within spatial support */
		int32_t iSliceCount	= 1;			
		if ( iLayerNum >= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info writing as follows
		{
			WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d)!", iLayerNum, MAX_LAYER_NUM_OF_FRAME);
			return -1;
		}

		iNalIdxInLayer	= 0;
		bAvcBased	= (iCurDid == BASE_DEPENDENCY_ID);
		pCtx->bNeedPrefixNalFlag	= (bAvcBased && 
			(pSvcParam->bPrefixNalAddingCtrl || 
			(pSvcParam->iNumDependencyLayer > 1) ));

		if ( eFrameType == WELS_FRAME_TYPE_P )
		{
			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE : NAL_UNIT_CODED_SLICE_EXT;					
		}
		else if ( eFrameType == WELS_FRAME_TYPE_IDR )
		{
			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE_IDR : NAL_UNIT_CODED_SLICE_EXT;
		}
		if ( iCurTid == 0 || pCtx->eSliceType == I_SLICE )
			eNalRefIdc	= NRI_PRI_HIGHEST;
		else if ( iCurTid == pSvcParam->iDecompStages )
			eNalRefIdc	= NRI_PRI_LOWEST;
		else if ( 1 + iCurTid == pSvcParam->iDecompStages )
			eNalRefIdc	= NRI_PRI_LOW;
		else	// more details for other temporal layers?
			eNalRefIdc	= NRI_PRI_HIGHEST;
		pCtx->eNalType		= eNalType;
		pCtx->eNalPriority	= eNalRefIdc;				

		pCtx->pDecPic					= pCtx->ppRefPicListExt[iCurDid]->pNextBuffer;
#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
		fsnr					= pCtx->pDecPic;
#endif//#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
		pCtx->pDecPic->iPictureType	= pCtx->eSliceType;				
		pCtx->pDecPic->iFramePoc		= pCtx->iPOC;				

		WelsInitCurrentLayer( pCtx, iCurWidth, iCurHeight );

		WelsMarkPic(pCtx);
		if ( !WelsBuildRefList( pCtx, pCtx->iPOC ) )
		{
			// Force coding IDR as followed
			ForceCodingIDR( pCtx );
			WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsBuildRefList failed for P frames, pCtx->iNumRef0= %d.\n", pCtx->iNumRef0);
			return -1;
		}
#ifdef LONG_TERM_REF_DUMP
		dump_ref(pCtx);
#endif
		WelsUpdateRefSyntax(pCtx,  pCtx->iPOC, eFrameType);	//get reordering syntax used for writing slice header and transmit to encoder.
		PrefetchReferencePicture( pCtx, eFrameType );	// update reference picture for current pDq layer

		pCtx->pFuncList->pfRc.pfWelsRcPictureInit(pCtx);
		PreprocessSliceCoding( pCtx );	// MUST be called after pfWelsRcPictureInit() and WelsInitCurrentLayer()

		iLayerSize	= 0;
		if ( SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// only one slice within a sQualityStat layer
		{
			int32_t iSliceSize = 0;					
			
			if ( pCtx->bNeedPrefixNalFlag )
			{
				iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
			}
			
			WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
			
			WelsCodeOneSlice( pCtx, 0, eNalType );
			
			WelsUnloadNal( pCtx->pOut );
			
			iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
											&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
											pCtx->pFrameBs + pCtx->iPosBsBuffer,
											&iNalLen[iNalIdxInLayer] );
			iLayerSize += iSliceSize;
			pCtx->iPosBsBuffer	+= iSliceSize;
			pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
			pLayerBsInfo->uiSpatialId		= iCurDid;
			pLayerBsInfo->uiTemporalId	= iCurTid;
			pLayerBsInfo->uiQualityId		= 0;
			pLayerBsInfo->uiPriorityId	= 0;
			pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
			pLayerBsInfo->iNalCount		= ++ iNalIdxInLayer;					
		}
		// for dynamic slicing single threading..
#ifndef MT_ENABLED
		else if ( SM_DYN_SLICE == param_d->sMso.uiSliceMode )
#else	// MT_ENABLED
		else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc <= 1) )
#endif//MT_ENABLED
		{
			const int32_t kiLastMbInFrame = pCtx->pCurDqLayer->pSliceEncCtx->iMbNumInFrame;
			WelsCodeOnePicPartition( pCtx, pLayerBsInfo, &iNalIdxInLayer, &iLayerSize, 0, kiLastMbInFrame, 0 );					
		}
		else
		{//other multi-slice uiSliceMode			
#if defined(MT_ENABLED)
            int err = 0;
			// THREAD_FULLY_FIRE_MODE/THREAD_PICK_UP_MODE for any mode of non-SM_DYN_SLICE
			if ( (SM_DYN_SLICE != param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
			{
				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
				if ( iLayerNum +
#if defined(PACKING_ONE_SLICE_PER_LAYER)
					iSliceCount
#else
					1
#endif//PACKING_ONE_SLICE_PER_LAYER
					>= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info for further writing as followed
				{
					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d) at iDid= %d uiSliceMode= %d, iSliceCount= %d!",
						iLayerNum, MAX_LAYER_NUM_OF_FRAME, iCurDid, param_d->sMso.uiSliceMode, iSliceCount );
					return -1;
				}
				if ( iSliceCount <= 1 )
				{
					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iSliceCount(%d) from GetCurrentSliceNum() is untrusted due stack/heap crupted!\n", iSliceCount );
					return -1;
				}

				if ( pSvcParam->iCountThreadsNum >= iSliceCount )	//THREAD_FULLY_FIRE_MODE
				{
#if defined(PACKING_ONE_SLICE_PER_LAYER)
					int32_t iSliceIdx = 1;							
					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
					while (uiSliceIdx < iSliceCount)
					{
						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
						++ uiSliceIdx;
					}
#elif defined(MT_DEBUG)
					int64_t t_bs_append = 0;
#endif//PACKING_ONE_SLICE_PER_LAYER
					
					pCtx->iActiveThreadsNum	= iSliceCount;
					// to fire slice coding threads
					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iSliceCount, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
					if ( err )
					{
						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
						return -1;
					}
				
					WelsMultipleEventsWaitAllBlocking( iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
				

					// all slices are finished coding here
					// append exclusive slice 0 bs to pFrameBs
#if defined(PACKING_ONE_SLICE_PER_LAYER)
					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
					uiSliceIdx = 1;
					while (uiSliceIdx < iSliceCount)
					{
						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
						++ uiSliceIdx;
					}
					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
					pLayerBsInfo += iSliceCount;
#else
#if defined(MT_DEBUG)
					t_bs_append = WelsTime();
#endif//MT_DEBUG
					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
#if defined(MT_DEBUG)
					t_bs_append = WelsTime() - t_bs_append;
					if ( pCtx->pSliceThreading->pFSliceDiff )
					{
						fprintf(pCtx->pSliceThreading->pFSliceDiff, 
#if defined(WIN32)
							"%6I64d us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
#else
							"%6lld us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
#endif//WIN32
							t_bs_append, pCtx->iCodingIndex, iCurDid, 0 );
					}
#endif//MT_DEBUG
#endif//PACKING_ONE_SLICE_PER_LAYER
				}
				else	//THREAD_PICK_UP_MODE
				{
					int32_t iNumThreadsRunning = 0;
					int32_t iNumThreadsScheduled = 0;
					int32_t iIndexOfSliceToBeCoded = 0;
#if defined(PACKING_ONE_SLICE_PER_LAYER)
					int32_t iSliceIdx = 1;							
					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
					while (uiSliceIdx < iSliceCount)
					{
						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
						++ uiSliceIdx;
					}
#endif//PACKING_ONE_SLICE_PER_LAYER

					pCtx->iActiveThreadsNum	= pSvcParam->iCountThreadsNum;
					iNumThreadsScheduled	= pCtx->iActiveThreadsNum;
					iNumThreadsRunning		= iNumThreadsScheduled;
					// to fire slice coding threads
					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iNumThreadsRunning, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
					if ( err )
					{
						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
						return -1;
					}

					iIndexOfSliceToBeCoded = iNumThreadsRunning;
					while (1)
					{
						if ( iIndexOfSliceToBeCoded >= iSliceCount && iNumThreadsRunning <= 0 )
							break;								
#ifdef WIN32
						WELS_THREAD_ERROR_CODE lwait	= 0;
						int32_t iEventId				= -1;
						
						lwait = WelsMultipleEventsWaitSingleBlocking(	iNumThreadsScheduled,
																		&pCtx->pSliceThreading->pSliceCodedEvent[0],
																		2 );	// 2 ms for one tick
						iEventId = (int32_t)(lwait - WELS_THREAD_ERROR_WAIT_OBJECT_0);
						if ( iEventId >= 0 && iEventId < iNumThreadsScheduled )
						{									
							if ( iIndexOfSliceToBeCoded < iSliceCount )
							{		
								// pick up succeeding slice for threading
								// thread_id equal to iEventId per implementation here
								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].iSliceIndex	= iIndexOfSliceToBeCoded;
#ifdef PACKING_ONE_SLICE_PER_LAYER
								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].pLayerBs	= pLayerBsInfo+iIndexOfSliceToBeCoded;
#endif//PACKING_ONE_SLICE_PER_LAYER
								WelsEventSignal( &pCtx->pSliceThreading->pReadySliceCodingEvent[iEventId] );

								++ iIndexOfSliceToBeCoded;
							}
							else	// no other slices left for coding
							{										
								-- iNumThreadsRunning;
							}
						}
						else
						{
							WelsSleep(1);
						}								
#else//__GNUC__
						// TODO for pthread platforms
						// alternate implementation using blocking due non-blocking with timeout mode not support at wels thread lib, tune back if available
						WelsMultipleEventsWaitAllBlocking( iNumThreadsRunning, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
						if ( iIndexOfSliceToBeCoded < iSliceCount )
						{
							int32_t iThreadIdx = 0;
							// pick up succeeding slices for threading if left
							while ( iThreadIdx < iNumThreadsScheduled )
							{
								if ( iIndexOfSliceToBeCoded >= iSliceCount )
									break;
								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].iSliceIndex = iIndexOfSliceToBeCoded;
#ifdef PACKING_ONE_SLICE_PER_LAYER
								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].pLayerBs = pLayerBsInfo+iIndexOfSliceToBeCoded;
#endif//PACKING_ONE_SLICE_PER_LAYER
								WelsEventSignal( pCtx->pSliceThreading->pReadySliceCodingEvent[iThreadIdx] );

								++ iIndexOfSliceToBeCoded;
								++ iThreadIdx;
							}
							// update iNumThreadsRunning
							iNumThreadsRunning		= iThreadIdx;									
						}
						else
						{
							iNumThreadsRunning = 0;
						}
#endif//WIN32
					}//while(1)

// all slices are finished coding here
					// append exclusive slice 0 bs to pFrameBs
#if defined(PACKING_ONE_SLICE_PER_LAYER)
					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
					uiSliceIdx = 1;
					while (uiSliceIdx < iSliceCount)
					{
						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
						++ uiSliceIdx;
					}
					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
					pLayerBsInfo += iSliceCount;
#else
					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
#endif//PACKING_ONE_SLICE_PER_LAYER
				}
			}					
			// THREAD_FULLY_FIRE_MODE && SM_DYN_SLICE
			else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
			{
				const int32_t kiPartitionCnt	= pCtx->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
#if defined(PACKING_ONE_SLICE_PER_LAYER)
				ResetCountBsSizeInPartitions( pCtx->pSliceThreading->pCountBsSizeInPartition, kiPartitionCnt );
				pCtx->pCurDqLayer->pSliceEncCtx->iMaxSliceNumConstraint = WELS_MIN ( MAX_SLICES_NUM, DynamicMaxSliceNumConstraint( MAX_LAYER_NUM_OF_FRAME, iLayerNum, 1 + /*( num_qlayer - 1) +*/ ( ( (iCurDid==0) && ( pSvcParam->uiGopSize>1 ) ) ? 1: 0 ) ) );  				
#endif//PACKING_ONE_SLICE_PER_LAYER

				// to fire slice coding threads
				err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, kiPartitionCnt, pCtx->pCurDqLayer->pSliceEncCtx, TRUE );
				if ( err )
				{
					WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
						err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
					return -1;
				}

				WelsMultipleEventsWaitAllBlocking( kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0] );

#if defined(PACKING_ONE_SLICE_PER_LAYER)						
				iSliceCount = PostProcDynamicSlicingBsWriting( pCtx, pLayerBsInfo, &iLayerSize, kiPartitionCnt );
				assert(iLayerNum + iSliceCount < MAX_LAYER_NUM_OF_FRAME);
				pLayerBsInfo += iSliceCount;
				iLayerNum += iSliceCount;
#else
				iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, kiPartitionCnt );
#endif//PACKING_ONE_SLICE_PER_LAYER
			}
			else	// for non-dynamic-slicing mode single threading branch..
#endif//MT_ENABLED
			{
				const bool_t bNeedPrefix	= pCtx->bNeedPrefixNalFlag;
				int32_t iSliceIdx			= 0;

				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );						
				while (iSliceIdx < iSliceCount)
				{
					int32_t iSliceSize	= 0;

					if ( bNeedPrefix )
					{
						iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
					}
					
					WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
					WelsCodeOneSlice( pCtx, iSliceIdx, eNalType );
					WelsUnloadNal( pCtx->pOut );
					
					iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
													&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
													pCtx->pFrameBs + pCtx->iPosBsBuffer,
													&iNalLen[iNalIdxInLayer] );
					pCtx->iPosBsBuffer	+= iSliceSize;
					iLayerSize	+= iSliceSize;
					pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;

#if defined(SLICE_INFO_OUTPUT)
					fprintf(	stderr,
								"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
								iSliceIdx,
								(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
								eNalRefIdc,
								iSliceSize	);
#endif//SLICE_INFO_OUTPUT						
					++ iNalIdxInLayer;
					++ iSliceIdx;						
				}						

				pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
				pLayerBsInfo->uiSpatialId		= iCurDid;
				pLayerBsInfo->uiTemporalId	= iCurTid;
				pLayerBsInfo->uiQualityId		= 0;
				pLayerBsInfo->uiPriorityId	= 0;
				pLayerBsInfo->iNalCount		= iNalIdxInLayer;
			}
		}			

		// deblocking filter
		if (
#if defined(MT_ENABLED)
			(!pCtx->pCurDqLayer->bDeblockingParallelFlag) &&
#endif//MT_ENABLED
#if !defined(ENABLE_FRAME_DUMP)
			( (eNalRefIdc != NRI_PRI_LOWEST) && (param_d->iHighestTemporalId == 0 || iCurTid < param_d->iHighestTemporalId) ) &&
#endif//!ENABLE_FRAME_DUMP
			true
		)
		{
			PerformDeblockingFilter( pCtx );
		}

		// reference picture list update				
		if ( eNalRefIdc != NRI_PRI_LOWEST )
		{
			if ( !WelsUpdateRefList( pCtx ) )
			{
				// Force coding IDR as followed
				ForceCodingIDR( pCtx );
				WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsUpdateRefList failed.\n");
				return -1;
			}
		}

		iFrameSize += iLayerSize;				

		pCtx->pFuncList->pfRc.pfWelsRcPictureInfoUpdate(pCtx, iLayerSize);

#ifdef ENABLE_FRAME_DUMP
		// Dump reconstruction picture for each sQualityStat layer
		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
			DumpDependencyRec( fsnr, &param_d->sRecFileName[0], iCurDid );
#endif//ENABLE_FRAME_DUMP

#if defined(ENABLE_PSNR_CALC)
		snr_y	= WelsCalcPsnr(	fsnr->pData[0],
							fsnr->iLineSize[0],
							pEncPic->pData[0],
							pEncPic->iLineSize[0],
							iCurWidth,
							iCurHeight	);
		snr_u	= WelsCalcPsnr(	fsnr->pData[1],
							fsnr->iLineSize[1],
							pEncPic->pData[1],
							pEncPic->iLineSize[1],
							(iCurWidth>>1),
							(iCurHeight>>1)	);
		snr_v	= WelsCalcPsnr(	fsnr->pData[2],
							fsnr->iLineSize[2],
							pEncPic->pData[2],
							pEncPic->iLineSize[2],
							(iCurWidth>>1),
							(iCurHeight>>1)	);
#endif//ENABLE_PSNR_CALC

#if defined(LAYER_INFO_OUTPUT)
		fprintf( stderr, "%2s %5d: %-5d %2s   T%1d D%1d Q%-2d  QP%3d   Y%2.2f  U%2.2f  V%2.2f  %8d bits\n",
				 (iSpatialIdx == 0) ? "#AU" : "   ",
				 pCtx->iPOC,
				 pCtx->iFrameNum,
				 (uiFrameType == WELS_FRAME_TYPE_I || uiFrameType == WELS_FRAME_TYPE_IDR) ? "I": "P",
				 iCurTid,
				 iCurDid,
				 0,
				 pCtx->pWelsSvcRc[pCtx->uiDependencyId].iAverageFrameQp,
				 snr_y,
				 snr_u,
				 snr_v,
				 (iLayerSize<<3)	);
#endif//LAYER_INFO_OUTPUT

#if defined(STAT_OUTPUT)

#if defined(ENABLE_PSNR_CALC)
		{
			pCtx->sStatData[iCurDid][0].sQualityStat.rYPsnr[pCtx->eSliceType]	+= snr_y;
			pCtx->sStatData[iCurDid][0].sQualityStat.rUPsnr[pCtx->eSliceType]	+= snr_u;
			pCtx->sStatData[iCurDid][0].sQualityStat.rVPsnr[pCtx->eSliceType]	+= snr_v;
		}
#endif//ENABLE_PSNR_CALC
		
#if defined(MB_TYPES_CHECK) //091025, frame output
		if (pCtx->eSliceType == P_SLICE)
		{
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra4x4];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra16x16];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x16];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x8];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x16];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x8];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][PSkip] += pCtx->sPerInfo.iMbCount[P_SLICE][PSkip];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][8] += pCtx->sPerInfo.iMbCount[P_SLICE][8];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][9] += pCtx->sPerInfo.iMbCount[P_SLICE][9];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][10] += pCtx->sPerInfo.iMbCount[P_SLICE][10];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][11] += pCtx->sPerInfo.iMbCount[P_SLICE][11];
		}
		else if (pCtx->eSliceType == I_SLICE)
		{
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra4x4];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra16x16];
			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][7] += pCtx->sPerInfo.iMbCount[I_SLICE][7];
		}
		
		memset(pCtx->sPerInfo.iMbCount[P_SLICE], 0, 18*sizeof( int32_t ));
		memset(pCtx->sPerInfo.iMbCount[I_SLICE], 0, 18*sizeof( int32_t ));

#endif//MB_TYPES_CHECK
		{ 
    		//no pCtx->pSvcParam->bMgsT0OnlyStrategy
			++ pCtx->sStatData[iCurDid][0].sSliceData.iSliceCount[pCtx->eSliceType];	// for multiple slices coding
			pCtx->sStatData[iCurDid][0].sSliceData.iSliceSize[pCtx->eSliceType]	+= (iLayerSize<<3);	// bits
		}
#endif//STAT_OUTPUT

#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
		if ( pSvcParam->iMultipleThreadIdc <= 1 || SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// sigle thread actually used
#else
		if ( 1 )
#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
		{
			++ iLayerNum;
			++ pLayerBsInfo;
		}

						
		pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;

		if( pSvcParam->iPaddingFlag && pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize > 0 )
		{
			const int32_t kiPaddingNalSize = WritePadding(pCtx, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
			
#if GOM_TRACE_FLAG
			WelsLog( pCtx, WELS_LOG_INFO,"[RC] encoding_qp%d Padding: %d\n",pCtx->uiDependencyId, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
#endif
			if ( kiPaddingNalSize <= 0 )
				return -1;

			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingBitrateStat += pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize;
			
			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize=0;

			pLayerBsInfo->uiPriorityId	= 0;
			pLayerBsInfo->uiSpatialId		= 0;
			pLayerBsInfo->uiTemporalId	= 0;
			pLayerBsInfo->uiQualityId		= 0;
			pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
			pLayerBsInfo->iNalCount		= 1;
			pLayerBsInfo->iNalLengthInByte[0] = kiPaddingNalSize;
			++ pLayerBsInfo;
			pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
			++ iLayerNum;
		}

#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
		if ( param_d->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc > 1 &&
			 pSvcParam->iMultipleThreadIdc >= param_d->sMso.sSliceArgument.iSliceNum )
		{
			CalcSliceComplexRatio( pCtx->pSliceThreading->pSliceComplexRatio[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx, pCtx->pSliceThreading->pSliceConsumeTime[iCurDid] );
#if defined(MT_DEBUG)
			TrackSliceComplexities( pCtx, iCurDid );
#endif//#if defined(MT_DEBUG)
		}
#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE

		++ iSpatialIdx;		

		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
		{
			WelsSwapDqLayers( pCtx );
		}

		if ( pSvcParam->bEnableLongTermReference && (pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DELAY_MARK))) 
		{
			pCtx->bLongTermRefFlag[d_idx][0] = true;
		}

		if ( iCurTid < pCtx->uiSpatialLayersInTemporal[d_idx] - 1 || pSvcParam->iDecompStages == 0 )
		{
			if ( (iCurTid >= MAX_TEMPORAL_LEVEL)||(pCtx->uiSpatialLayersInTemporal[d_idx]-1>= MAX_TEMPORAL_LEVEL))
			{
				ForceCodingIDR( pCtx );	// some logic error
				return -1;
			}

			if ( pSvcParam->bEnableLongTermReference && pCtx->bLongTermRefFlag[d_idx][iCurTid] )
			{	
				SPicture *tmp	= pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx];
				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx] = pCtx->pSpatialPic[d_idx][iCurTid];
				pCtx->pSpatialPic[d_idx][iCurTid] = pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1];
				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1] = tmp;
				pCtx->bLongTermRefFlag[d_idx][iCurTid] = false;
			}
			else
			{
				WelsExchangeSpatialPictures( &pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1], &pCtx->pSpatialPic[d_idx][iCurTid] );
			}
		}

		if ( pSvcParam->bEnableLongTermReference && ((pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DIRECT_MARK)) || eFrameType == WELS_FRAME_TYPE_IDR)) 
		{
			pCtx->bLongTermRefFlag[d_idx][iCurTid] = true;
		}
	}

#if defined(MT_ENABLED) && defined(MT_DEBUG)
	TrackSliceConsumeTime( pCtx, did_list, iSpatialNum );
#endif//MT_ENABLED && MT_DEBUG
	
#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
	if ( pSvcParam->iMultipleThreadIdc > 1 && did_list[0] == BASE_DEPENDENCY_ID 
		&& pSvcParam->sDependencyLayers[0].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[0].sMso.sSliceArgument.iSliceNum
		&& pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.sSliceArgument.iSliceNum )
	{	
		AdjustBaseLayer( pCtx );
	}
#endif//DYNAMIC_SLICE_ASSIGN

#ifdef ENABLE_FRAME_DUMP
	DumpRecFrame( fsnr, &pSvcParam->sDependencyLayers[pSvcParam->iNumDependencyLayer-1].sRecFileName[0] );	// pDecPic: final reconstruction output
#endif//ENABLE_FRAME_DUMP
	
	++ pCtx->iCodingIndex;
	pCtx->eLastNalPriority	= eNalRefIdc;
	pFbi->iLayerNum			= iLayerNum;

#if defined(X86_ASM)
	WelsEmms();
#endif //X86_ASM	
 
	return eFrameType;
}

/*!
 * \brief	Apply a new SVC coding parameter set to an existing encoder instance
 *
 *			The request is validated first. It is then compared against the parameters
 *			currently attached to the encoder context:
 *			 - if any structural setting differs (layer counts, GOP size, picture geometry,
 *			   slice configuration, frame rate ratios, highest temporal id, LTR switch) the
 *			   encoder is uninitialized and initialized again from pNewParam, while the
 *			   parameter-set offset variables and the IDR picture id are carried over;
 *			 - otherwise a fixed list of fields is copied from pNewParam into the existing
 *			   parameter set, after clipping some of them inside pNewParam itself.
 *
 * \param	ppCtx		address of the encoder context pointer (handed to
 *						WelsUninitEncoderExt / WelsInitEncoderExt on the reset path)
 * \param	pNewParam	requested parameters; modified in place by the clipping step on the
 *						non-reset path
 *
 * \return	0 on success; 1 if an argument is NULL or WelsInitEncoderExt reports failure;
 *			otherwise the non-zero value returned by ParamValidationExt
 */
int32_t WelsEncoderParamAdjust( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pNewParam )
{
	if ( NULL == ppCtx || NULL == *ppCtx || NULL == pNewParam )
		return 1;

	// refuse invalid requests before anything in the encoder is touched
	const int32_t kiValidation = ParamValidationExt( pNewParam );
	if ( kiValidation != 0 )
		return kiValidation;

	SWelsSvcCodingParam *pCurParam = (*ppCtx)->pSvcParam;

	// Step 1: sequence level settings whose change requires a full re-initialization
	// (temporal levels, spatial settings, picture geometry, long term reference switch)
	bool_t bResetNeeded = ( NULL == pCurParam );
	if ( !bResetNeeded )
	{
		bResetNeeded =	pCurParam->iNumTemporalLayer != pNewParam->iNumTemporalLayer ||
						pCurParam->uiGopSize != pNewParam->uiGopSize ||
						pCurParam->iNumDependencyLayer != pNewParam->iNumDependencyLayer ||
						pCurParam->iDecompStages != pNewParam->iDecompStages ||
						pCurParam->iActualPicWidth != pNewParam->iActualPicWidth ||
						pCurParam->iActualPicHeight != pNewParam->iActualPicHeight ||
						pCurParam->SUsedPicRect.iWidth != pNewParam->SUsedPicRect.iWidth ||
						pCurParam->SUsedPicRect.iHeight != pNewParam->SUsedPicRect.iHeight ||
						pCurParam->bEnableLongTermReference != pNewParam->bEnableLongTermReference;
	}

	// Step 2: per dependency layer settings (resolution, slicing, frame rate ratios, temporal id)
	if ( !bResetNeeded )
	{
		int8_t iDid = 0;

		assert( pCurParam->iNumDependencyLayer == pNewParam->iNumDependencyLayer );
		do
		{
			const SDLayerParam *kpCurDlp	= &pCurParam->sDependencyLayers[iDid];
			const SDLayerParam *kpReqDlp	= &pNewParam->sDependencyLayers[iDid];

			// frame size, slice configuration or highest temporal id changed
			if ( kpCurDlp->iFrameWidth != kpReqDlp->iFrameWidth ||
				 kpCurDlp->iFrameHeight != kpReqDlp->iFrameHeight ||
				 kpCurDlp->iActualWidth != kpReqDlp->iActualWidth ||
				 kpCurDlp->iActualHeight != kpReqDlp->iActualHeight ||
				 kpCurDlp->sMso.uiSliceMode != kpReqDlp->sMso.uiSliceMode ||
				 kpCurDlp->sMso.sSliceArgument.iSliceNum != kpReqDlp->sMso.sSliceArgument.iSliceNum ||
				 kpCurDlp->iHighestTemporalId != kpReqDlp->iHighestTemporalId )
			{
				bResetNeeded = true;
				break;
			}

			// Frame rates are not compared directly; only the ratios output/input and
			// max/output are compared. A ratio difference is left at zero when one of
			// its denominators is not above EPSN.
			const float kfOutInDiff		= ( kpReqDlp->fInputFrameRate > EPSN && kpCurDlp->fInputFrameRate > EPSN ) ?
				( kpReqDlp->fOutputFrameRate/kpReqDlp->fInputFrameRate - kpCurDlp->fOutputFrameRate/kpCurDlp->fInputFrameRate ) : .0f;
			const float kfMaxOutDiff	= ( kpReqDlp->fOutputFrameRate > EPSN && kpCurDlp->fOutputFrameRate > EPSN ) ?
				( pNewParam->fMaxFrameRate/kpReqDlp->fOutputFrameRate - pCurParam->fMaxFrameRate/kpCurDlp->fOutputFrameRate ) : .0f;
			if ( kfOutInDiff > EPSN || kfOutInDiff < -EPSN || kfMaxOutDiff > EPSN || kfMaxOutDiff < -EPSN )
			{
				bResetNeeded = true;
				break;
			}
		} while ( ++ iDid < pCurParam->iNumDependencyLayer );
	}

	if ( bResetNeeded )
	{
		// State that has to survive the teardown: parameter-set offset variables
		// (for FLEXIBLE_PARASET_ID) and the IDR picture id (for LTR)
		SParaSetOffsetVariable sSavedPsoVariable[PARA_SET_TYPE];
		const uint16_t kuiSavedIdrPicId = (*ppCtx)->sPSOVector.uiIdrPicId;

		memcpy( sSavedPsoVariable, (*ppCtx)->sPSOVector.sParaSetOffsetVariable, (PARA_SET_TYPE)*sizeof(SParaSetOffsetVariable) );	// confirmed_safe_unsafe_usage

		WelsUninitEncoderExt( ppCtx );

		// rebuild the encoder from the new parameters
		if ( WelsInitEncoderExt( ppCtx, pNewParam ) )
			return 1;

		// reset the scaled spatial picture size
		(*ppCtx)->pVpp->WelsPreprocessReset(*ppCtx);

		// put the saved state back into the freshly initialized context
		memcpy( (*ppCtx)->sPSOVector.sParaSetOffsetVariable, sSavedPsoVariable, (PARA_SET_TYPE)*sizeof(SParaSetOffsetVariable) );	// confirmed_safe_unsafe_usage
		(*ppCtx)->sPSOVector.uiIdrPicId = kuiSavedIdrPicId;

		return 0;
	}

	// No reset required: bitrate or other light-weight settings changed.
	// A whole-struct copy is not used because only the fields listed below are taken over.

	/* General settings */
	pNewParam->fMaxFrameRate	= WELS_CLIP3(pNewParam->fMaxFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
	pNewParam->iNumRefFrame		= WELS_CLIP3(pNewParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);

	pCurParam->fMaxFrameRate			= pNewParam->fMaxFrameRate;				// maximal frame rate [Hz / fps]
	pCurParam->iNumRefFrame				= pNewParam->iNumRefFrame;				// number of reference frames used
	pCurParam->iInputCsp				= pNewParam->iInputCsp;					// color space of input sequence
	pCurParam->uiIntraPeriod			= pNewParam->uiIntraPeriod;				// intra period (multiple of GOP size as desired)
	pCurParam->bEnableSpsPpsIdAddition	= pNewParam->bEnableSpsPpsIdAddition;
	pCurParam->bPrefixNalAddingCtrl		= pNewParam->bPrefixNalAddingCtrl;
	pCurParam->bEnableSSEI				= pNewParam->bEnableSSEI;
	pCurParam->bEnableFrameCroppingFlag	= pNewParam->bEnableFrameCroppingFlag;	// enable frame cropping flag

	/* Pre-processing switches: denoise, background detection, adaptive quantization */
	pCurParam->bEnableDenoise				= pNewParam->bEnableDenoise;
	pCurParam->bEnableBackgroundDetection	= pNewParam->bEnableBackgroundDetection;
	pCurParam->bEnableAdaptiveQuant			= pNewParam->bEnableAdaptiveQuant;

	/* Long term reference control */
	pCurParam->bEnableLongTermReference	= pNewParam->bEnableLongTermReference;
	pCurParam->uiLtrMarkPeriod			= pNewParam->uiLtrMarkPeriod;

	/* Deblocking loop filter: values are clipped in the request first, then taken over */
	pNewParam->iLoopFilterDisableIdc				= WELS_CLIP3(pNewParam->iLoopFilterDisableIdc, 0, 6);
	pNewParam->iLoopFilterAlphaC0Offset				= WELS_CLIP3(pNewParam->iLoopFilterAlphaC0Offset, -6, 6);
	pNewParam->iLoopFilterBetaOffset				= WELS_CLIP3(pNewParam->iLoopFilterBetaOffset, -6, 6);
	pNewParam->iInterLayerLoopFilterDisableIdc		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterDisableIdc, 0, 6);
	pNewParam->iInterLayerLoopFilterAlphaC0Offset	= WELS_CLIP3(pNewParam->iInterLayerLoopFilterAlphaC0Offset, -6, 6);
	pNewParam->iInterLayerLoopFilterBetaOffset		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterBetaOffset, -6, 6);

	pCurParam->iLoopFilterDisableIdc				= pNewParam->iLoopFilterDisableIdc;					// 0: on, 1: off, 2: on except for slice boundaries
	pCurParam->iLoopFilterAlphaC0Offset				= pNewParam->iLoopFilterAlphaC0Offset;				// AlphaOffset: valid range [-6, 6], default 0
	pCurParam->iLoopFilterBetaOffset				= pNewParam->iLoopFilterBetaOffset;					// BetaOffset: valid range [-6, 6], default 0
	pCurParam->iInterLayerLoopFilterDisableIdc		= pNewParam->iInterLayerLoopFilterDisableIdc;		// inter-layer counterparts of the three above
	pCurParam->iInterLayerLoopFilterAlphaC0Offset	= pNewParam->iInterLayerLoopFilterAlphaC0Offset;
	pCurParam->iInterLayerLoopFilterBetaOffset		= pNewParam->iInterLayerLoopFilterBetaOffset;

	/* Rate control */
	pCurParam->bEnableRc		= pNewParam->bEnableRc;
	pCurParam->iRCMode			= pNewParam->iRCMode;
	pCurParam->iTargetBitrate	= pNewParam->iTargetBitrate;	// overall target bitrate introduced in RC module
	pCurParam->iPaddingFlag		= pNewParam->iPaddingFlag;

	/* Per dependency layer settings */
	int8_t iLayerIdx = 0;
	do
	{
		SDLayerParam *pDstDlp		= &pCurParam->sDependencyLayers[iLayerIdx];
		const SDLayerParam *kpSrcDlp	= &pNewParam->sDependencyLayers[iLayerIdx];

		pDstDlp->fInputFrameRate	= kpSrcDlp->fInputFrameRate;	// input frame rate
		pDstDlp->fOutputFrameRate	= kpSrcDlp->fOutputFrameRate;	// output frame rate
		pDstDlp->iSpatialBitrate	= kpSrcDlp->iSpatialBitrate;
		pDstDlp->uiProfileIdc		= kpSrcDlp->uiProfileIdc;		// value of profile IDC (0 for auto-detection)

		/* Derived values */
		pDstDlp->iTemporalResolution	= kpSrcDlp->iTemporalResolution;
		pDstDlp->iDecompositionStages	= kpSrcDlp->iDecompositionStages;
		memcpy(pDstDlp->uiCodingIdx2TemporalId, kpSrcDlp->uiCodingIdx2TemporalId, sizeof(pDstDlp->uiCodingIdx2TemporalId));	// confirmed_safe_unsafe_usage
	} while ( ++ iLayerIdx < pCurParam->iNumDependencyLayer );

	/* Any else initialization/reset for rate control here? */

	return 0;
}


/*!
 * \brief	Encode every slice of one picture partition into the frame bitstream
 *
 *			Slices are coded one after another, starting at iStartSliceIdx and advancing the
 *			slice index by pCtx->iActiveThreadsNum each time, until the partition reports no
 *			macroblock left. Each slice (optionally preceded by a prefix NAL) is appended at
 *			pCtx->pFrameBs + pCtx->iPosBsBuffer and its size is recorded in pLayerBsInfo.
 *
 * \param	pCtx				encoder context
 * \param	pLayerBsInfo		layer bitstream info receiving NAL lengths, NAL count and layer ids
 * \param	pNalIdxInLayer		in: first NAL index to use in the layer; out: next free NAL index
 * \param	pLayerSize			out: number of bytes produced for this partition
 * \param	iFirstMbInPartition	first mb inclusive in partition
 * \param	iEndMbInPartition	end mb exclusive in partition
 * \param	iStartSliceIdx		index of the first slice of this partition
 *
 * \return	0 on success; 1 when the slice index reaches pSliceCtx->iMaxSliceNumConstraint.
 *			On the failure path *pLayerSize, *pNalIdxInLayer and the layer id fields of
 *			pLayerBsInfo are NOT written.
 */
int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
									SLayerBSInfo *pLayerBsInfo,
									int32_t *pNalIdxInLayer,									
									int32_t* pLayerSize,
									int32_t iFirstMbInPartition,	// first mb inclusive in partition
									int32_t iEndMbInPartition,	// end mb exclusive in partition
									int32_t iStartSliceIdx
								  )
{

	SDqLayer * pCurLayer			= pCtx->pCurDqLayer;
	SSliceCtx * pSliceCtx		= pCurLayer->pSliceEncCtx;	
	int32_t iNalLen[MAX_NAL_UNITS_IN_LAYER]			= {0};
	int32_t iNalIdxInLayer		= *pNalIdxInLayer;
	int32_t iSliceIdx				= iStartSliceIdx;
	// NOTE(review): the modulo below assumes pCtx->iActiveThreadsNum is non-zero - confirm against callers
	const int32_t kiSliceStep		= pCtx->iActiveThreadsNum;
	const int32_t kiPartitionId		= iStartSliceIdx % kiSliceStep;
	int32_t iPartitionBsSize		= 0;
	int32_t iAnyMbLeftInPartition= iEndMbInPartition - iFirstMbInPartition;
	const EWelsNalUnitType keNalType	= pCtx->eNalType;
	const EWelsNalRefIdc keNalRefIdc	= pCtx->eNalPriority;
	const bool_t kbNeedPrefix		= pCtx->bNeedPrefixNalFlag;

	// initialize the per-partition bookkeeping of the current layer
	{
		pSliceCtx->pFirstMbInSlice[iSliceIdx]		= iFirstMbInPartition;	
		pCurLayer->pNumSliceCodedOfPartition[kiPartitionId]	= 1;	// one slice per partition initialized, dynamic slicing inside
		pCurLayer->pLastMbIdxOfPartition[kiPartitionId]		= iEndMbInPartition-1;
	}
	pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = 0;

	while ( iAnyMbLeftInPartition > 0 )
	{
		int32_t iSliceSize	= 0;

		if ( iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint )	// insufficient memory in pSliceInLayer[]
		{
			// TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
			// No idea about its solution due to MAX_SLICES_NUM being a fixed length in the relevant data structure
			return 1;
		}
		
		if ( kbNeedPrefix )
		{
			iPartitionBsSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, keNalType, keNalRefIdc );
		}

		// code the slice into a NAL of the output buffer
		WelsLoadNal( pCtx->pOut, keNalType, keNalRefIdc );
		WelsCodeOneSlice( pCtx, iSliceIdx, keNalType );
		WelsUnloadNal( pCtx->pOut );

		// write the just finished NAL at the current position of the frame bitstream
		iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
			&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
			pCtx->pFrameBs + pCtx->iPosBsBuffer,
			&iNalLen[iNalIdxInLayer]	);
		pCtx->iPosBsBuffer	+= iSliceSize;
		iPartitionBsSize	+= iSliceSize;
		pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;		

#if defined(SLICE_INFO_OUTPUT)
		// the NAL priority lives in the local keNalRefIdc in this function
		fprintf(	stderr,
			"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
			iSliceIdx,
			(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
			keNalRefIdc,
			iSliceSize	);
#endif//SLICE_INFO_OUTPUT

		++ iNalIdxInLayer;
		iSliceIdx += kiSliceStep;	// slice indices of one partition are not contiguous
		// presumably pLastCodedMbIdxOfPartition[] is advanced while the slice is coded - verify in WelsCodeOneSlice
		iAnyMbLeftInPartition = iEndMbInPartition - (1 + pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId]);		
	}

	*pLayerSize			= iPartitionBsSize;
	*pNalIdxInLayer	= iNalIdxInLayer;

	// slice based packing???
	pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
	pLayerBsInfo->uiSpatialId		= pCtx->uiDependencyId;
	pLayerBsInfo->uiTemporalId	= pCtx->uiTemporalId;
	pLayerBsInfo->uiQualityId		= 0;
	pLayerBsInfo->uiPriorityId	= 0;
	pLayerBsInfo->iNalCount		= iNalIdxInLayer;

	return 0;
}
} // namespace WelsSVCEnc