shithub: openh264

ref: 5364be4504d4c8b2da84ebfee42fccb8b2d10089
dir: /codec/encoder/core/src/slice_multi_threading.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2010-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    slice_multi_threading.cpp
 *
 * \brief   slice based multiple threading
 *
 * \date    04/16/2010 Created
 *
 *************************************************************************************
 */


#include <assert.h>
#if !defined(_WIN32)
#include <semaphore.h>
#include <unistd.h>
#endif//!_WIN32
#ifndef SEM_NAME_MAX
// length of semaphore name should be system constrained at least on mac 10.7
#define  SEM_NAME_MAX 32
#endif//SEM_NAME_MAX
#include "slice_multi_threading.h"
#include "mt_defs.h"
#include "nal_encap.h"
#include "utils.h"
#include "encoder.h"
#include "svc_encode_slice.h"
#include "deblocking.h"
#include "svc_enc_golomb.h"
#include "crt_util_safe_x.h" // for safe crt like calls
#include "rc.h"

#include "cpu.h"

#include "measure_time.h"
#include "wels_task_management.h"

// Trace hook used throughout this file for multi-threading diagnostics.
// With ENABLE_TRACE_MT defined, every use is forwarded to WelsLog(); otherwise the macro
// expands to nothing, so its arguments are neither compiled nor evaluated.
#if defined(ENABLE_TRACE_MT)
#define MT_TRACE_LOG(pLog, x, ...) WelsLog(pLog, x, __VA_ARGS__)
#else
#define MT_TRACE_LOG(x, ...)
#endif

namespace WelsEnc {
/*!
 * \brief   Call UpdateMbNeighbor() for every macroblock that belongs to one slice.
 *
 * \param   pCurDq      layer providing the per-slice first-MB index and MB count tables
 * \param   pMbList     macroblock array addressed by MB index
 * \param   uiSliceIdc  index of the slice whose macroblocks are walked
 *
 * \note    The first macroblock of the slice is always processed, even if the recorded
 *          MB count of the slice is not positive.
 */
void UpdateMbListNeighborParallel (SDqLayer* pCurDq,
                                   SMB* pMbList,
                                   const int32_t uiSliceIdc) {
  const int32_t kiWidthInMb = pCurDq->sSliceEncCtx.iMbWidth;
  const int32_t kiFirstMb   = pCurDq->pFirstMbIdxOfSlice[uiSliceIdc];
  const int32_t kiLastMb    = kiFirstMb + pCurDq->pCountMbNumInSlice[uiSliceIdc] - 1;
  int32_t iMbIdx            = kiFirstMb;

  do {
    UpdateMbNeighbor (pCurDq, pMbList + iMbIdx, kiWidthInMb, uiSliceIdc);
  } while (++iMbIdx <= kiLastMb);
}

/*!
 * \brief   Derive iSliceComplexRatio of every slice in the layer from its MB count and
 *          its consumed coding time.
 *
 * For each slice a throughput figure (INT_MULTIPLY * MB count / consumed time) is computed
 * with WELS_DIV_ROUND; each slice's ratio is then its figure relative to the sum of all
 * figures, again scaled by INT_MULTIPLY.
 *
 * \param   pCurDq  layer whose slices are updated in place
 *
 * \note    The slice count is only checked against MAX_SLICES_NUM by an assert.
 */
void CalcSliceComplexRatio (SDqLayer* pCurDq) {
  SSlice** ppSlices        = pCurDq->ppSliceInLayer;
  const int32_t kiSliceNum = pCurDq->sSliceEncCtx.iSliceNumInFrame;
  int32_t iMbPerTime[MAX_SLICES_NUM];
  int32_t iMbPerTimeSum    = 0;

  assert (kiSliceNum <= MAX_SLICES_NUM);
  WelsEmms();

  // first pass: per-slice throughput and its total
  for (int32_t iIdx = 0; iIdx < kiSliceNum; ++iIdx) {
    SSlice* pSlice = ppSlices[iIdx];
    iMbPerTime[iIdx] = WELS_DIV_ROUND (INT_MULTIPLY * pSlice->iCountMbNumInSlice,
                                       pSlice->uiSliceConsumeTime);
    MT_TRACE_LOG (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), uiSliceConsumeTime[%d]= %d us, slice_run= %d",
                  iIdx,
                  pSlice->uiSliceConsumeTime, pSlice->iCountMbNumInSlice);
    iMbPerTimeSum += iMbPerTime[iIdx];
  }

  // second pass (last slice first): normalize each figure against the total
  for (int32_t iIdx = kiSliceNum - 1; iIdx >= 0; --iIdx) {
    ppSlices[iIdx]->iSliceComplexRatio = WELS_DIV_ROUND (INT_MULTIPLY * iMbPerTime[iIdx], iMbPerTimeSum);
  }
}

/*!
 * \brief   Decide whether the slice partition should be re-balanced, based on how unevenly
 *          the consumed time is spread over the slices.
 *
 * \param   ppSliceInLayer  slices whose uiSliceConsumeTime is inspected
 * \param   iSliceNum       number of slices to inspect
 *
 * \return  non-zero if the RMSE of the per-slice time ratios exceeds the threshold chosen
 *          for iSliceNum; zero otherwise, including when a pointer is NULL or the total
 *          consumed time is zero.
 *
 * \note    NOTE(review): the accumulation loop stops before the last slice whenever
 *          iSliceNum >= 2, while the sum is still divided by iSliceNum - confirm that
 *          this is intended before changing it, the thresholds may depend on it.
 */
int32_t NeedDynamicAdjust (SSlice** ppSliceInLayer, const int32_t iSliceNum) {
  if (NULL == ppSliceInLayer)
    return false;

  WelsEmms();

  // total consumed time over all slices; bail out on any missing slice
  uint32_t uiTimeSum = 0;
  for (int32_t iIdx = 0; iIdx < iSliceNum; ++iIdx) {
    if (NULL == ppSliceInLayer[iIdx])
      return false;
    uiTimeSum += ppSliceInLayer[iIdx]->uiSliceConsumeTime;
  }

  if (0 == uiTimeSum) {
    MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
                  "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d",
                  iSliceNum);
    return false;
  }

  // squared deviation of each slice's time share from the uniform share
  const float kfMeanRatio = 1.0f / iSliceNum;
  float fRmse             = .0f;
  int32_t iSliceIdx       = 0;
  do {
    const float fShare     = 1.0f * ppSliceInLayer[iSliceIdx]->uiSliceConsumeTime / uiTimeSum;
    const float fDeviation = fShare - kfMeanRatio;
    fRmse += (fDeviation * fDeviation);
    ++iSliceIdx;
  } while (iSliceIdx + 1 < iSliceNum);
  fRmse = sqrtf (fRmse / iSliceNum);

  // threshold depends on the slice count
  float fThr = EPSN;
  if (iSliceNum >= 8)
    fThr += THRESHOLD_RMSE_CORE8;
  else if (iSliceNum >= 4)
    fThr += THRESHOLD_RMSE_CORE4;
  else if (iSliceNum >= 2)
    fThr += THRESHOLD_RMSE_CORE2;
  else
    fThr = 1.0f;

  const int32_t iNeedAdj = (fRmse > fThr) ? true : false;
  MT_TRACE_LOG (NULL, WELS_LOG_DEBUG,
                "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d",
                iNeedAdj, fRmse, fThr, iSliceNum);

  return iNeedAdj;
}

/*!
 * \brief   Re-distribute the macroblocks of a frame over its slices according to each
 *          slice's iSliceComplexRatio and hand the new run lengths to
 *          DynamicAdjustSlicePEncCtxAll().
 *
 * \param   pCtx         encoder context (rate-control mode, per-layer RC data, log context)
 * \param   pCurDqLayer  layer whose slicing is adjusted; bNeedAdjustingSlicing is updated
 * \param   iCurDid      index used to select the layer's entry in pCtx->pWelsSvcRc
 *
 * The function returns without changing anything when
 *  - rate control is enabled and the GOM size is not positive (an error is logged), or
 *  - rate control is enabled and one GOM per slice already covers the whole frame, or
 *  - the slice count is below 2 or odd, or
 *  - no macroblock would be left for the remaining slices.
 */
void DynamicAdjustSlicing (sWelsEncCtx* pCtx,
                           SDqLayer* pCurDqLayer,
                           int32_t iCurDid) {
  SSliceCtx* pSliceCtx       = &pCurDqLayer->sSliceEncCtx;
  SSlice** ppSlices          = pCurDqLayer->ppSliceInLayer;
  const int32_t kiSliceNum   = pSliceCtx->iSliceNumInFrame;
  const int32_t kiFrameMbNum = pSliceCtx->iMbNumInFrame;
  const bool kbRcEnabled     = (pCtx->pSvcParam->iRCMode != RC_OFF_MODE);

  // smallest slice: one MB row without rate control, one GOM with rate control
  int32_t iMinMbNum = pSliceCtx->iMbWidth;
  int32_t iGomMbNum = 0;
  if (kbRcEnabled) {
    iGomMbNum = pCtx->pWelsSvcRc[iCurDid].iNumberMbGom;

    if (iGomMbNum <= 0) {
      WelsLog (& (pCtx->sLogCtx), WELS_LOG_ERROR,
               "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d", iGomMbNum,
               iCurDid, kiFrameMbNum);
      return;
    }

    // one GOM per slice already uses up the frame: keep the initial partition
    if (iGomMbNum * kiSliceNum >= kiFrameMbNum)
      return;

    iMinMbNum = iGomMbNum;
  }

  // only an even slice count of at least 2 is handled
  if (kiSliceNum < 2 || (kiSliceNum & 0x01))
    return;

  WelsEmms();

  MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d", iCurDid,
                kiFrameMbNum);

  int32_t iRunLen[MAX_THREADS_NUM] = {0};
  int32_t iMbLeft                  = kiFrameMbNum;
  int32_t iSliceIdx                = 0;

  for (iSliceIdx = 0; iSliceIdx + 1 < kiSliceNum; ++iSliceIdx) {
    // upper bound that still leaves the minimum for every slice after this one
    const int32_t kiMaxMbNum = iMbLeft - (kiSliceNum - iSliceIdx - 1) * iMinMbNum;

    int32_t iAssigned = WELS_DIV_ROUND (kiFrameMbNum * ppSlices[iSliceIdx]->iSliceComplexRatio, INT_MULTIPLY);

    // round down to a GOM boundary when rate control is active
    if (kbRcEnabled)
      iAssigned = iAssigned / iGomMbNum * iGomMbNum;

    // the lower bound takes precedence over the upper bound
    iAssigned = (iAssigned < iMinMbNum) ? iMinMbNum : ((iAssigned > kiMaxMbNum) ? kiMaxMbNum : iAssigned);

    assert (iAssigned > 0);

    iMbLeft -= iAssigned;
    if (iMbLeft <= 0) { // nothing left for the remaining slices: give up on this adjustment
      assert (0);
      return;
    }

    iRunLen[iSliceIdx] = iAssigned;
    MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG,
                  "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, iSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
                  iSliceIdx, ppSlices[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY,
                  ppSlices[iSliceIdx]->iCountMbNumInSlice,
                  iAssigned);
  }

  // the last slice takes whatever remains
  iRunLen[iSliceIdx] = iMbLeft;
  MT_TRACE_LOG (& (pCtx->sLogCtx), WELS_LOG_DEBUG,
                "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d",
                iSliceIdx, ppSlices[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY,
                ppSlices[iSliceIdx]->iCountMbNumInSlice, iMbLeft);

  pCurDqLayer->bNeedAdjustingSlicing = !DynamicAdjustSlicePEncCtxAll (pCurDqLayer, iRunLen);
}

/*!
 * \brief   Allocate and initialize the multi-threading resources of an encoder context:
 *          the SSliceThreading state, the per-thread private data, the named events,
 *          the mutexes, the task manager and the per-thread bitstream buffers.
 *
 * \param   ppCtx                address of the encoder context pointer; must be non-NULL
 *                               and point to a non-NULL context
 * \param   pCodingParam         coding parameters (spatial layer count and thread count
 *                               are read from it)
 * \param   iCountBsLen          size in bytes of each per-thread bitstream buffer; must be > 0
 * \param   iMaxSliceBufferSize  not referenced in this function
 * \param   bDynamicSlice        forwarded to IWelsTaskManage::CreateTaskManage()
 *
 * \return  0 on success; 1 if an argument is invalid. The WELS_VERIFY_RETURN_* macros are
 *          defined elsewhere - presumably they return their first argument (1) when the
 *          given condition holds.
 *
 * \note    The return values of WelsEventOpen() are only used for tracing here.
 */
int32_t RequestMtResource (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingParam, const int32_t iCountBsLen,
                           const int32_t iMaxSliceBufferSize, bool bDynamicSlice) {
  CMemoryAlign* pMa             = NULL;
  SWelsSvcCodingParam* pPara    = NULL;
  SSliceThreading* pSmt         = NULL;
  int32_t iNumSpatialLayers     = 0;
  int32_t iThreadNum            = 0;
  int32_t iIdx                  = 0;
  int32_t iReturn               = ENC_RETURN_SUCCESS;

  if (NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0)
    return 1;
#if defined(ENABLE_TRACE_MT)
  // only needed by the MT_TRACE_LOG calls below
  SLogContext* pLogCtx = & ((*ppCtx)->sLogCtx);
#endif
  pMa                  = (*ppCtx)->pMemAlign;
  pPara                = pCodingParam;
  iNumSpatialLayers    = pPara->iSpatialLayerNum;
  iThreadNum           = pPara->iMultipleThreadIdc;

  // the thread count is validated by this assert only
  assert (iThreadNum > 0);

  // zero-initialized threading state, published in the context right away
  pSmt = (SSliceThreading*)pMa->WelsMalloc (sizeof (SSliceThreading), "SSliceThreading");
  WELS_VERIFY_RETURN_IF (1, (NULL == pSmt))
  memset (pSmt, 0, sizeof (SSliceThreading));
  (*ppCtx)->pSliceThreading = pSmt;
  // one private data record per thread
  pSmt->pThreadPEncCtx = (SSliceThreadPrivateData*)pMa->WelsMalloc (sizeof (SSliceThreadPrivateData) * iThreadNum,
                         "pThreadPEncCtx");
  WELS_VERIFY_RETURN_IF (1, (NULL == pSmt->pThreadPEncCtx))

  // event name suffix: the context address, plus the process id on non-Windows builds
#ifdef _WIN32
  // Dummy event namespace, the windows events don't actually use this
  WelsSnprintf (pSmt->eventNamespace, sizeof (pSmt->eventNamespace), "%p", (void*) *ppCtx);
#else
  WelsSnprintf (pSmt->eventNamespace, sizeof (pSmt->eventNamespace), "%p%x", (void*) *ppCtx, getpid());
#endif//!_WIN32

#ifdef MT_DEBUG
  // file handle for MT debug
  pSmt->pFSliceDiff = NULL;

  // the handle was just set to NULL, so this close branch is never taken
  if (pSmt->pFSliceDiff) {
    fclose (pSmt->pFSliceDiff);
    pSmt->pFSliceDiff = NULL;
  }
  // may stay NULL if the file cannot be opened
  pSmt->pFSliceDiff = fopen ("slice_time.txt", "wt+");
#endif//MT_DEBUG

  MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "encpEncCtx= 0x%p", (void*) *ppCtx);

  char name[SEM_NAME_MAX] = {0};
  WELS_GCC_UNUSED WELS_THREAD_ERROR_CODE err = 0;

  // per-thread setup: private data plus four named events
  // (prefixes "ud", "fu", "sc", "rc" followed by the thread index and the namespace)
  iIdx = 0;
  while (iIdx < iThreadNum) {
    pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx   = (void*) *ppCtx;
    pSmt->pThreadPEncCtx[iIdx].iSliceIndex    = iIdx;
    pSmt->pThreadPEncCtx[iIdx].iThreadIndex   = iIdx;
    pSmt->pThreadHandles[iIdx]                = 0;

    // length of semaphore name should be system constrained at least on mac 10.7
    WelsSnprintf (name, SEM_NAME_MAX, "ud%d%s", iIdx, pSmt->eventNamespace);
    err = WelsEventOpen (&pSmt->pUpdateMbListEvent[iIdx], name);
    MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d", iIdx, name, err, errno);
    WelsSnprintf (name, SEM_NAME_MAX, "fu%d%s", iIdx, pSmt->eventNamespace);
    err = WelsEventOpen (&pSmt->pFinUpdateMbListEvent[iIdx], name);
    MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d", iIdx, name, err,
                  errno);
    WelsSnprintf (name, SEM_NAME_MAX, "sc%d%s", iIdx, pSmt->eventNamespace);
    err = WelsEventOpen (&pSmt->pSliceCodedEvent[iIdx], name);
    MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d", iIdx, name, err, errno);
    WelsSnprintf (name, SEM_NAME_MAX, "rc%d%s", iIdx, pSmt->eventNamespace);
    err = WelsEventOpen (&pSmt->pReadySliceCodingEvent[iIdx], name);
    MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d", iIdx,
                  (void*)pSmt->pReadySliceCodingEvent[iIdx], name, err, errno);
    ++ iIdx;
  }

  // single master event shared by all threads (prefix "scm")
  WelsSnprintf (name, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
  err = WelsEventOpen (&pSmt->pSliceCodedMasterEvent, name);
  MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "[MT] Open pSliceCodedMasterEvent named(%s) ret%d err%d", name, err, errno);

  iReturn = WelsMutexInit (&pSmt->mutexSliceNumUpdate);
  WELS_VERIFY_RETURN_IF (1, (WELS_THREAD_ERROR_OK != iReturn))

  (*ppCtx)->pTaskManage = IWelsTaskManage::CreateTaskManage (*ppCtx, iNumSpatialLayers, bDynamicSlice);
  WELS_VERIFY_RETURN_IF (1, (NULL == (*ppCtx)->pTaskManage))

  // one zeroed bitstream buffer per pool thread, capped at MAX_THREADS_NUM
  int32_t iThreadBufferNum = WELS_MIN ((*ppCtx)->pTaskManage->GetThreadPoolThreadNum(), MAX_THREADS_NUM);

  for (iIdx = 0; iIdx < iThreadBufferNum; iIdx++) {
    pSmt->pThreadBsBuffer[iIdx] = (uint8_t*)pMa->WelsMallocz (iCountBsLen, "pSmt->pThreadBsBuffer");
    WELS_VERIFY_RETURN_IF (1, (NULL == pSmt->pThreadBsBuffer[iIdx]))
  }
  // the three mutex checks below pass FreeMemorySvc (ppCtx) as the cleanup expression
  iReturn = WelsMutexInit (&pSmt->mutexThreadBsBufferUsage);
  WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx))

  iReturn = WelsMutexInit (&pSmt->mutexEvent);
  WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx));

  iReturn = WelsMutexInit (&pSmt->mutexThreadSlcBuffReallocate);
  WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx))

  iReturn = WelsMutexInit (& (*ppCtx)->mutexEncoderError);
  WELS_VERIFY_RETURN_IF (1, (WELS_THREAD_ERROR_OK != iReturn))

  // NOTE(review): the values passed do not match the labels in the message -
  // "iThreadNum" receives iMultipleThreadIdc and "iMultipleThreadIdc" receives iMaxSliceCount
  MT_TRACE_LOG (pLogCtx, WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iMultipleThreadIdc= %d",
                pPara->iMultipleThreadIdc,
                (*ppCtx)->iMaxSliceCount);
  return 0;
}

/*!
 * \brief   Release what RequestMtResource() set up: the named events, the mutexes, the
 *          per-thread data and bitstream buffers, the task manager and finally the
 *          SSliceThreading state itself.
 *
 * \param   ppCtx  address of the encoder context pointer; the call is a no-op when it,
 *                 the context or the context's pSliceThreading is NULL
 */
void ReleaseMtResource (sWelsEncCtx** ppCtx) {
  if (NULL == ppCtx || NULL == *ppCtx)
    return;

  sWelsEncCtx* pCtx         = *ppCtx;
  CMemoryAlign* pMa         = pCtx->pMemAlign;
  const int32_t kiThreadNum = pCtx->pSvcParam->iMultipleThreadIdc;
  SSliceThreading* pSmt     = pCtx->pSliceThreading;

  if (NULL == pSmt)
    return;

  // event names are rebuilt exactly as they were composed when the events were opened
  char sEventName[SEM_NAME_MAX] = {0};
  for (int32_t iThreadIdx = 0; iThreadIdx < kiThreadNum; ++iThreadIdx) {
    // length of semaphore name should be system constrained at least on mac 10.7
    WelsSnprintf (sEventName, SEM_NAME_MAX, "sc%d%s", iThreadIdx, pSmt->eventNamespace);
    WelsEventClose (&pSmt->pSliceCodedEvent[iThreadIdx], sEventName);
    WelsSnprintf (sEventName, SEM_NAME_MAX, "rc%d%s", iThreadIdx, pSmt->eventNamespace);
    WelsEventClose (&pSmt->pReadySliceCodingEvent[iThreadIdx], sEventName);
    WelsSnprintf (sEventName, SEM_NAME_MAX, "ud%d%s", iThreadIdx, pSmt->eventNamespace);
    WelsEventClose (&pSmt->pUpdateMbListEvent[iThreadIdx], sEventName);
    WelsSnprintf (sEventName, SEM_NAME_MAX, "fu%d%s", iThreadIdx, pSmt->eventNamespace);
    WelsEventClose (&pSmt->pFinUpdateMbListEvent[iThreadIdx], sEventName);
  }
  WelsSnprintf (sEventName, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
  WelsEventClose (&pSmt->pSliceCodedMasterEvent, sEventName);

  WelsMutexDestroy (&pSmt->mutexSliceNumUpdate);
  WelsMutexDestroy (&pSmt->mutexThreadBsBufferUsage);
  WelsMutexDestroy (&pSmt->mutexThreadSlcBuffReallocate);
  WelsMutexDestroy (& (pCtx->mutexEncoderError));
  WelsMutexDestroy (&pSmt->mutexEvent);

  if (NULL != pSmt->pThreadPEncCtx) {
    pMa->WelsFree (pSmt->pThreadPEncCtx, "pThreadPEncCtx");
    pSmt->pThreadPEncCtx = NULL;
  }

  // every slot is visited; slots that were never allocated are still NULL
  for (int32_t iBufIdx = 0; iBufIdx < MAX_THREADS_NUM; ++iBufIdx) {
    if (pSmt->pThreadBsBuffer[iBufIdx]) {
      pMa->WelsFree (pSmt->pThreadBsBuffer[iBufIdx], "pSmt->pThreadBsBuffer");
      pSmt->pThreadBsBuffer[iBufIdx] = NULL;
    }
  }
  memset (&pSmt->bThreadBsBufferUsage, 0, MAX_THREADS_NUM * sizeof (bool));

  if (NULL != pCtx->pTaskManage) {
    WELS_DELETE_OP (pCtx->pTaskManage);
  }

#ifdef MT_DEBUG
  // file handle for debug
  if (pSmt->pFSliceDiff) {
    fclose (pSmt->pFSliceDiff);
    pSmt->pFSliceDiff = NULL;
  }
#endif//MT_DEBUG

  pMa->WelsFree (pCtx->pSliceThreading, "SSliceThreading");
  pCtx->pSliceThreading = NULL;
}

/*!
 * \brief   Append the bitstreams of the first iSliceCount slices of the current layer to
 *          the frame buffer and record their NAL lengths in the layer info.
 *
 * \param   pCtx         encoder context; pFrameBs receives the data and iPosBsBuffer is
 *                       advanced by the number of bytes copied
 * \param   pLbi         layer bitstream info; iNalCount is reset to 0 and then rebuilt
 *                       together with pNalLengthInByte
 * \param   iSliceCount  number of slices to visit
 *
 * \return  total number of bytes appended
 *
 * \note    Slices whose uiBsPos is 0 contribute neither data nor NAL entries.
 */
int32_t AppendSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, const int32_t iSliceCount) {
  SSlice** ppSlices   = pCtx->pCurDqLayer->ppSliceInLayer;
  int32_t iTotalBytes = 0;
  int32_t iNalBase    = 0;

  pLbi->iNalCount = 0;
  for (int32_t iSliceIdx = 0; iSliceIdx < iSliceCount; ++iSliceIdx) {
    SWelsSliceBs* pSliceBs = &ppSlices[iSliceIdx]->sSliceBs;
    if (NULL == pSliceBs || ! (pSliceBs->uiBsPos > 0))
      continue;

    const int32_t kiNalNum = pSliceBs->iNalIndex;

#if MT_DEBUG_BS_WR
    assert (pSliceBs->bSliceCodedFlag);
#endif//MT_DEBUG_BS_WR

    memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos); // confirmed_safe_unsafe_usage
    pCtx->iPosBsBuffer += pSliceBs->uiBsPos;
    iTotalBytes        += pSliceBs->uiBsPos;

    // NAL lengths of this slice go right behind those of the slices already appended
    for (int32_t iNalIdx = 0; iNalIdx < kiNalNum; ++iNalIdx) {
      pLbi->pNalLengthInByte[iNalBase + iNalIdx] = pSliceBs->iNalLen[iNalIdx];
    }
    pLbi->iNalCount += kiNalNum;
    iNalBase        += kiNalNum;
  }

  return iTotalBytes;
}

/*!
 * \brief   Encode the NAL units queued in a slice bitstream into pSliceBs->pBs.
 *
 * \param   pCtx        encoder context; the NAL header extension of the current layer is used
 * \param   pSliceBs    slice bitstream: sNalList/iNalIndex are read, iNalLen and uiBsPos
 *                      are written
 * \param   iSliceIdx   not referenced in this function
 * \param   iSliceSize  [out] number of bytes written so far; reset to 0 before encoding
 *
 * \return  0 without touching iSliceSize when more than two NALs are queued; otherwise the
 *          result of the last WelsEncodeNal() call (ENC_RETURN_SUCCESS when nothing was
 *          queued). WELS_VERIFY_RETURN_IFNEQ is defined elsewhere - presumably it returns
 *          early when a call does not yield ENC_RETURN_SUCCESS, in which case uiBsPos is
 *          left as it was.
 */
int32_t WriteSliceBs (sWelsEncCtx* pCtx, SWelsSliceBs* pSliceBs, const int32_t iSliceIdx, int32_t& iSliceSize) {
  const int32_t kiNalCnt        = pSliceBs->iNalIndex;
  // space available for output, taken before uiBsPos is overwritten below
  const int32_t kiRoom          = pSliceBs->uiSize - pSliceBs->uiBsPos;
  SNalUnitHeaderExt* pNalHdrExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
  uint8_t* pWrite               = pSliceBs->pBs;
  int32_t iReturn               = ENC_RETURN_SUCCESS;

  assert (kiNalCnt <= 2);
  if (kiNalCnt > 2)
    return 0;

  iSliceSize = 0;
  for (int32_t iNalIdx = 0; iNalIdx < kiNalCnt; ++iNalIdx) {
    int32_t iNalSize = 0;
    iReturn = WelsEncodeNal (&pSliceBs->sNalList[iNalIdx], pNalHdrExt, kiRoom - iSliceSize,
                             pWrite, &iNalSize);
    WELS_VERIFY_RETURN_IFNEQ (iReturn, ENC_RETURN_SUCCESS)

    pSliceBs->iNalLen[iNalIdx] = iNalSize;
    iSliceSize                += iNalSize;
    pWrite                    += iNalSize;
  }
  pSliceBs->uiBsPos = iSliceSize;

  return iReturn;
}

// thread process for coding one pSlice
int32_t DynamicDetectCpuCores() {
  WelsLogicalProcessInfo  info;
  WelsQueryLogicalProcessInfo (&info);
  return info.ProcessorCount;
}

/*!
 * \brief   Make layer 0 the current layer and re-balance its slices when
 *          NeedDynamicAdjust() asks for it.
 *
 * \param   pCtx  encoder context; pCurDqLayer is set to ppDqLayerList[0]
 *
 * \return  the decision returned by NeedDynamicAdjust() (non-zero: adjustment requested)
 */
int32_t AdjustBaseLayer (sWelsEncCtx* pCtx) {
  SDqLayer* pBaseDq = pCtx->ppDqLayerList[0];
#ifdef MT_DEBUG
  int64_t iElapsed = WelsTime();
#endif//MT_DEBUG

  pCtx->pCurDqLayer = pBaseDq;

  // slices with evenly spread consumed time need no adjustment
  const int32_t kiNeedAdj = NeedDynamicAdjust (pBaseDq->ppSliceInLayer,
                            pBaseDq->sSliceEncCtx.iSliceNumInFrame);
  if (kiNeedAdj) {
    DynamicAdjustSlicing (pCtx, pBaseDq, 0);
  }

#ifdef MT_DEBUG
  iElapsed = WelsTime() - iElapsed;
  if (pCtx->pSliceThreading->pFSliceDiff) {
    fprintf (pCtx->pSliceThreading->pFSliceDiff,
             "%6" PRId64" us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
             iElapsed, kiNeedAdj);
  }
#endif//MT_DEBUG

  return kiNeedAdj;
}

/*!
 * \brief   Re-balance the slices of the current (enhancement) layer when
 *          NeedDynamicAdjust() asks for it.
 *
 * The consumed times used for the decision are taken from spatial layer iCurDid - 1 when
 * the current layer has a reference layer, iCurDid > 0, that lower layer uses
 * SM_FIXEDSLCNUM_SLICE and its slice number does not exceed iMultipleThreadIdc; otherwise
 * they are taken from layer iCurDid itself.
 *
 * \param   pCtx     encoder context; pCurDqLayer is the layer being adjusted
 * \param   iCurDid  index of the current layer
 *
 * \return  the decision returned by NeedDynamicAdjust() (non-zero: adjustment requested)
 */
int32_t AdjustEnhanceLayer (sWelsEncCtx* pCtx, int32_t iCurDid) {
#ifdef MT_DEBUG
  int64_t iElapsed = WelsTime();
#endif//MT_DEBUG

  // the lower spatial layer is only inspected once iCurDid - 1 is known to be a valid index
  bool bModelingFromSpatial = false;
  if (pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0) {
    bModelingFromSpatial =
      (pCtx->pSvcParam->sSpatialLayers[iCurDid - 1].sSliceArgument.uiSliceMode == SM_FIXEDSLCNUM_SLICE
       && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sSpatialLayers[iCurDid -
           1].sSliceArgument.uiSliceNum);
  }

  // layer whose slice timings drive the decision
  const int32_t kiModelDid = bModelingFromSpatial ? (iCurDid - 1) : iCurDid;

  // slices with evenly spread consumed time need no adjustment
  const int32_t kiNeedAdj = NeedDynamicAdjust (pCtx->ppDqLayerList[kiModelDid]->ppSliceInLayer,
                            pCtx->pCurDqLayer->sSliceEncCtx.iSliceNumInFrame);
  if (kiNeedAdj) {
    DynamicAdjustSlicing (pCtx, pCtx->pCurDqLayer, iCurDid);
  }

#ifdef MT_DEBUG
  iElapsed = WelsTime() - iElapsed;
  if (pCtx->pSliceThreading->pFSliceDiff) {
    fprintf (pCtx->pSliceThreading->pFSliceDiff,
             "%6" PRId64" us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
             iElapsed, iCurDid, kiNeedAdj);
  }
#endif//MT_DEBUG

  return kiNeedAdj;
}



#if defined(MT_DEBUG)
/*!
 * \brief   Write the complexity ratio of every slice of the current layer to the MT debug
 *          file (one line per slice).
 *
 * \param   pCtx     encoder context; pCurDqLayer supplies the slices,
 *                   pSliceThreading->pFSliceDiff is the output file
 * \param   iCurDid  layer index printed alongside each ratio
 *
 * \note    Nothing is written when the threading state or the debug file is unavailable.
 */
void TrackSliceComplexities (sWelsEncCtx* pCtx, const int32_t iCurDid) {
  // same guard as the other MT_DEBUG writers in this file
  if (NULL == pCtx->pSliceThreading || NULL == pCtx->pSliceThreading->pFSliceDiff)
    return;

  const int32_t kiCountSliceNum = pCtx->pCurDqLayer->sSliceEncCtx.iSliceNumInFrame;
  SSlice** ppSliceInLayer       = pCtx->pCurDqLayer->ppSliceInLayer;

  for (int32_t iSliceIdx = 0; iSliceIdx < kiCountSliceNum; ++iSliceIdx) {
    // iSliceComplexRatio is kept scaled by INT_MULTIPLY; convert it to a floating-point
    // ratio so that the argument really matches the "%6.3f" conversion
    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n",
             ppSliceInLayer[iSliceIdx]->iSliceComplexRatio * 1.0f / INT_MULTIPLY, iCurDid, iSliceIdx);
  }
}
#endif

#if defined(MT_DEBUG)
/*!
 * \brief   Write the consumed time of every slice, and the maximum among them, to the MT
 *          debug file for each listed spatial layer.
 *
 * \param   pCtx         encoder context
 * \param   pDidList     layer indices to report, iSpatialNum entries
 * \param   iSpatialNum  number of entries in pDidList; nothing is done when it exceeds
 *                       MAX_DEPENDENCY_LAYER
 *
 * \note    A layer is only reported when the debug file is open, its slice mode is
 *          SM_FIXEDSLCNUM_SLICE or SM_SIZELIMITED_SLICE, and iMultipleThreadIdc is
 *          greater than 1 and not smaller than the layer's slice count.
 */
void TrackSliceConsumeTime (sWelsEncCtx* pCtx, int32_t* pDidList, const int32_t iSpatialNum) {
  if (iSpatialNum > MAX_DEPENDENCY_LAYER)
    return;

  SWelsSvcCodingParam* pParam = pCtx->pSvcParam;
  for (int32_t iLayerIdx = 0; iLayerIdx < iSpatialNum; ++iLayerIdx) {
    const int32_t kiDid         = pDidList[iLayerIdx];
    SSliceArgument* pSliceArg   = &pParam->sSpatialLayers[kiDid].sSliceArgument;
    SDqLayer* pLayer            = pCtx->ppDqLayerList[kiDid];
    SSlice** ppSlices           = pLayer->ppSliceInLayer;
    const uint32_t kuiSliceNum  = pLayer->sSliceEncCtx.iSliceNumInFrame;

    if (!pCtx->pSliceThreading || !pCtx->pSliceThreading->pFSliceDiff)
      continue;

    const bool kbTrackedMode = (pSliceArg->uiSliceMode == SM_FIXEDSLCNUM_SLICE)
                               || (pSliceArg->uiSliceMode == SM_SIZELIMITED_SLICE);
    if (!kbTrackedMode)
      continue;
    if (! (pParam->iMultipleThreadIdc > 1) || ! (pParam->iMultipleThreadIdc >= kuiSliceNum))
      continue;

    // one line per slice; remember the first slice holding the largest time
    uint32_t uiLongest  = 0;
    int32_t iLongestIdx = 0;
    for (uint32_t i = 0; i < kuiSliceNum; ++i) {
      fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n",
               ppSlices[i]->uiSliceConsumeTime, pCtx->iCodingIndex, kiDid, i /*/ 1000*/);
      if (ppSlices[i]->uiSliceConsumeTime > uiLongest) {
        uiLongest   = ppSlices[i]->uiSliceConsumeTime;
        iLongestIdx = i;
      }
    }
    fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n", uiLongest,
             pCtx->iCodingIndex, kiDid, iLongestIdx /*/ 1000*/);
  }
}
#endif//#if defined(MT_DEBUG)

/*!
 * \brief   Point a slice's bitstream at the buffer owned by the given thread and rewind
 *          its write position.
 *
 * \param   pCtx         encoder context; pSliceThreading->pThreadBsBuffer supplies the buffer
 * \param   kiThreadIdx  index into pThreadBsBuffer (not range-checked here)
 * \param   pSlice       slice whose sSliceBs.pBsBuffer and sSliceBs.uiBsPos are set
 */
void SetOneSliceBsBufferUnderMultithread (sWelsEncCtx* pCtx, const int32_t kiThreadIdx, SSlice* pSlice) {
  pSlice->sSliceBs.pBsBuffer = pCtx->pSliceThreading->pThreadBsBuffer[kiThreadIdx];
  pSlice->sSliceBs.uiBsPos   = 0;
}
}