shithub: openh264

Download patch

ref: e7f5f6a0528ab2e49ccf5199e772dd7b42a1a8a9
parent: 033c6a0448227d6290b402435c7faec9d3063dd0
author: xiaotiansf <[email protected]>
date: Mon Nov 11 06:18:59 EST 2019

Threaded decoding enhancement:

1. Distinguish non-threaded decoding (m_iThreadCount = 0) from single-threaded decoding (m_iThreadCount = 1).
2. Removed the use of PPicture's bAvailableFlag and replaced it with iRefCount to more precisely represent the picture's in-use count in both threaded and non-threaded modes.

--- a/codec/console/dec/src/h264dec.cpp
+++ b/codec/console/dec/src/h264dec.cpp
@@ -283,7 +283,7 @@
         goto label_exit;
       iSliceSize = static_cast<int32_t> (pInfo[2]);
     } else {
-      if (iThreadCount > 1) {
+      if (iThreadCount >= 1) {
         uint8_t* uSpsPtr = NULL;
         int32_t iSpsByteCount = 0;
         iSliceSize = readPicture (pBuf, iFileSize, iBufPos, uSpsPtr, iSpsByteCount);
@@ -577,7 +577,7 @@
     pDecoder->SetOption (DECODER_OPTION_TRACE_LEVEL, &iLevelSetting);
   }
 
-  int32_t iThreadCount = 1;
+  int32_t iThreadCount = 0;
   pDecoder->SetOption (DECODER_OPTION_NUM_OF_THREADS, &iThreadCount);
 
   if (pDecoder->Initialize (&sDecParam)) {
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -550,6 +550,14 @@
     }
   }
 }
+static inline int32_t GetThreadCount (PWelsDecoderContext pCtx) {
+  int32_t iThreadCount = 0;
+  if (pCtx->pThreadCtx != NULL) {
+    PWelsDecoderThreadCTX pThreadCtx = (PWelsDecoderThreadCTX)pCtx->pThreadCtx;
+    iThreadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
+  }
+  return iThreadCount;
+}
 //#ifdef __cplusplus
 //}
 //#endif//__cplusplus
--- a/codec/decoder/core/inc/picture.h
+++ b/codec/decoder/core/inc/picture.h
@@ -69,8 +69,7 @@
   /*******************************sef_definition for misc use****************************/
   bool            bUsedAsRef;                                                     //for ref pic management
   bool            bIsLongRef;     // long term reference frame flag       //for ref pic management
-  uint8_t         uiRefCount;
-  bool            bAvailableFlag; // indicate whether it is available in this picture memory block.
+  int8_t          iRefCount;
 
   bool            bIsComplete;    // indicate whether current picture is complete, not from EC
   /*******************************for future use****************************/
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -236,7 +236,7 @@
   }
   WelsMbInterSampleConstruction (pCtx, pCurDqLayer, pDstY, pDstCb, pDstCr, iLumaStride, iChromaStride);
 
-  if (pCtx->pThreadCtx == NULL) {
+  if (GetThreadCount (pCtx) <= 1) {
     pCtx->sBlockFunc.pWelsSetNonZeroCountFunc (
       pCurDqLayer->pNzc[pCurDqLayer->iMbXyIndex]); // set all none-zero nzc to 1; dbk can be opti!
   }
@@ -1365,7 +1365,7 @@
 
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[0][iMbXy], 0, sizeof (int8_t) * 16);
-    bool bIsPending = pCtx->pThreadCtx != NULL;
+    bool bIsPending = GetThreadCount (pCtx) > 1;
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && (ppRefPic[0]->bIsComplete
                             || bIsPending));
     //predict mv
@@ -1421,7 +1421,7 @@
 
   memset (pCurDqLayer->pDirect[iMbXy], 0, sizeof (int8_t) * 16);
 
-  bool bIsPending = pCtx->pThreadCtx != NULL;
+  bool bIsPending = GetThreadCount (pCtx) > 1;
 
   if (uiCode) {
     int16_t pMv[LIST_A][2] = { {0, 0}, { 0, 0 } };
@@ -1696,7 +1696,7 @@
 
   SDeblockingFilter pFilter;
   int32_t iFilterIdc = 1;
-  if (pCtx->pThreadCtx && pSliceHeader->uiDisableDeblockingFilterIdc != 1) {
+  if (pSliceHeader->uiDisableDeblockingFilterIdc != 1) {
     WelsDeblockingInitFilter (pCtx, pFilter, iFilterIdc);
   }
 
@@ -1764,11 +1764,15 @@
     pCurDqLayer->iMbX = iMbX;
     pCurDqLayer->iMbY = iMbY;
     pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
-    if ((iMbY > iLastMby) && (iLastMbx == pCurDqLayer->iMbWidth - 1)) {
-      SET_EVENT (&pCtx->pDec->pReadyEvent[iLastMby]);
+    if (GetThreadCount (pCtx) > 1) {
+      if ((iMbY > iLastMby) && (iLastMbx == pCurDqLayer->iMbWidth - 1)) {
+        SET_EVENT (&pCtx->pDec->pReadyEvent[iLastMby]);
+      }
     }
   } while (1);
-  SET_EVENT (&pCtx->pDec->pReadyEvent[pCurDqLayer->iMbY]);
+  if (GetThreadCount (pCtx) > 1) {
+    SET_EVENT (&pCtx->pDec->pReadyEvent[pCurDqLayer->iMbY]);
+  }
   return ERR_NONE;
 }
 
@@ -2467,7 +2471,7 @@
 
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[0][iMbXy], 0, sizeof (int8_t) * 16);
-    bool bIsPending = pCtx->pThreadCtx != NULL;
+    bool bIsPending = GetThreadCount (pCtx) > 1;
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && (ppRefPic[0]->bIsComplete
                             || bIsPending));
     //predict iMv
@@ -2564,7 +2568,7 @@
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[LIST_0][iMbXy], 0, sizeof (int8_t) * 16);
     memset (pCurDqLayer->pDec->pRefIndex[LIST_1][iMbXy], 0, sizeof (int8_t) * 16);
-    bool bIsPending = pCtx->pThreadCtx != NULL;
+    bool bIsPending = GetThreadCount (pCtx) > 1;
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && (ppRefPicL0[0]->bIsComplete
                             || bIsPending)) || ! (ppRefPicL1[0] && (ppRefPicL1[0]->bIsComplete || bIsPending));
 
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -151,8 +151,7 @@
   for (int32_t i = 0; i < pPicNewBuf->iCapacity; i++) {
     pPicNewBuf->ppPic[i]->bUsedAsRef = false;
     pPicNewBuf->ppPic[i]->bIsLongRef = false;
-    pPicNewBuf->ppPic[i]->uiRefCount = 0;
-    pPicNewBuf->ppPic[i]->bAvailableFlag = true;
+    pPicNewBuf->ppPic[i]->iRefCount = 0;
     pPicNewBuf->ppPic[i]->bIsComplete = false;
   }
 // remove old PicBuf
@@ -240,8 +239,7 @@
   for (int32_t i = 0; i < pPicNewBuf->iCapacity; i++) {
     pPicNewBuf->ppPic[i]->bUsedAsRef = false;
     pPicNewBuf->ppPic[i]->bIsLongRef = false;
-    pPicNewBuf->ppPic[i]->uiRefCount = 0;
-    pPicNewBuf->ppPic[i]->bAvailableFlag = true;
+    pPicNewBuf->ppPic[i]->iRefCount = 0;
     pPicNewBuf->ppPic[i]->bIsComplete = false;
   }
   // remove old PicBuf
@@ -440,7 +438,7 @@
     iNumRefFrames = MAX_REF_PIC_COUNT + 2;
   } else {
     iNumRefFrames = pCtx->pSps->iNumRefFrames + 2;
-    if (pCtx->pThreadCtx != NULL) {
+    if (GetThreadCount (pCtx) > 1) {
       iNumRefFrames = MAX_REF_PIC_COUNT + 1;
     }
   }
@@ -484,7 +482,7 @@
                          && kiPicHeight == pCtx->iImgHeightInPixel) && (!bNeedChangePicQueue)) // have same scaled buffer
 
   // sync update pRefList
-  if (pCtx->pThreadCtx == NULL) {
+  if (GetThreadCount (pCtx) <= 1) {
     WelsResetRefPic (pCtx); // added to sync update ref list due to pictures are free
   }
 
@@ -562,7 +560,7 @@
   if (NULL != pPicBuff && NULL != *pPicBuff) {
     DestroyPicBuff (pCtx, pPicBuff, pMa);
   }
-  if (pCtx->pThreadCtx != NULL) {
+  if (GetThreadCount (pCtx) > 1) {
     //prevent from double destruction of PPicBuff
     PWelsDecoderThreadCTX pThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pThreadCtx);
     int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -221,10 +221,10 @@
   ppDst[1] = ppDst[1] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   ppDst[2] = ppDst[2] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   pDstInfo->iBufferStatus = 1;
-  if (pCtx->pThreadCtx != NULL && pPic->bIsComplete == false) {
+  if (GetThreadCount (pCtx) > 1 && pPic->bIsComplete == false) {
     pPic->bIsComplete = true;
   }
-  if (pCtx->pThreadCtx != NULL) {
+  if (GetThreadCount (pCtx) > 1) {
     uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
     for (uint32_t i = 0; i < uiMbHeight; ++i) {
       SET_EVENT (&pCtx->pDec->pReadyEvent[i]);
@@ -231,7 +231,7 @@
     }
   }
   bool bOutResChange = false;
-  if (pCtx->pThreadCtx == NULL || pCtx->pLastThreadCtx == NULL) {
+  if (GetThreadCount (pCtx) <= 1 || pCtx->pLastThreadCtx == NULL) {
     bOutResChange = (pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
                     || (pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
   } else {
@@ -2282,7 +2282,7 @@
 */
 int32_t AllocPicBuffOnNewSeqBegin (PWelsDecoderContext pCtx) {
   //try to allocate or relocate DPB memory only when new sequence is coming.
-  if (pCtx->pThreadCtx == NULL) {
+  if (GetThreadCount (pCtx) <= 1) {
     WelsResetRefPic (pCtx); //clear ref pPic when IDR NAL
   }
   int32_t iErr = SyncPictureResolutionExt (pCtx, pCtx->pSps->iMbWidth, pCtx->pSps->iMbHeight);
@@ -2418,7 +2418,7 @@
 
 int32_t InitRefPicList (PWelsDecoderContext pCtx, const uint8_t kuiNRi, int32_t iPoc) {
   int32_t iRet = ERR_NONE;
-  if (pCtx->pThreadCtx != NULL && pCtx->bNewSeqBegin) {
+  if (GetThreadCount (pCtx) > 1 && pCtx->bNewSeqBegin) {
     WelsResetRefPic (pCtx);
   }
   if (pCtx->eSliceType == B_SLICE) {
@@ -2542,7 +2542,7 @@
       }
     }
     bool isNewFrame = true;
-    if (pThreadCtx != NULL) {
+    if (GetThreadCount (pCtx) > 1) {
       isNewFrame = pCtx->pDec == NULL;
     }
     if (pCtx->pDec == NULL) {
@@ -2553,7 +2553,6 @@
             uint32_t i = 0;
             while (i < MAX_DPB_COUNT && pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i]) {
               pLastThreadCtx->pDec->pRefPic[listIdx][i] = pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i];
-              pLastThreadCtx->pDec->pRefPic[listIdx][i]->bAvailableFlag = false;
               ++i;
             }
           }
@@ -2563,30 +2562,8 @@
         } else {
           pCtx->sRefPic = pLastThreadCtx->pCtx->sRefPic;
         }
-        //printf ("last uiDecodingTimeStamp = %d\n", pLastThreadCtx->pCtx->uiDecodingTimeStamp);
-        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_0]; ++i) {
-          if (pCtx->sRefPic.pRefList[LIST_0][i] != NULL) {
-            pCtx->sRefPic.pRefList[LIST_0][i]->bAvailableFlag = false;
-          }
-        }
-        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_1]; ++i) {
-          if (pCtx->sRefPic.pRefList[LIST_1][i] != NULL) {
-            pCtx->sRefPic.pRefList[LIST_1][i]->bAvailableFlag = false;
-          }
-        }
       }
       pCtx->pDec = PrefetchPic (pCtx->pPicBuff);
-      if (pThreadCtx != NULL) {
-        if (pCtx->pDec != NULL) {
-          pCtx->pDec->bAvailableFlag = false;
-          pCtx->pDec->bIsUngroupedMultiSlice = false;
-          pThreadCtx->pDec = pCtx->pDec;
-          uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
-          for (uint32_t i = 0; i < uiMbHeight; ++i) {
-            RESET_EVENT (&pCtx->pDec->pReadyEvent[i]);
-          }
-        }
-      }
       if (pCtx->iTotalNumMbRec != 0)
         pCtx->iTotalNumMbRec = 0;
 
@@ -2598,6 +2575,15 @@
         pCtx->iErrorCode |= dsOutOfMemory;
         return ERR_INFO_REF_COUNT_OVERFLOW;
       }
+      if (pThreadCtx != NULL) {
+        pCtx->pDec->bIsUngroupedMultiSlice = false;
+        pThreadCtx->pDec = pCtx->pDec;
+        if (GetThreadCount (pCtx) > 1) ++pCtx->pDec->iRefCount;
+        uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
+        for (uint32_t i = 0; i < uiMbHeight; ++i) {
+          RESET_EVENT (&pCtx->pDec->pReadyEvent[i]);
+        }
+      }
       pCtx->pDec->bNewSeqBegin = pCtx->bNewSeqBegin; //set flag for start decoding
     } else if (pCtx->iTotalNumMbRec == 0) { //pDec != NULL, already start
       pCtx->pDec->bNewSeqBegin = pCtx->bNewSeqBegin; //set flag for start decoding
@@ -2743,7 +2729,7 @@
         if (pSh->eSliceType == B_SLICE && !pSh->iDirectSpatialMvPredFlag)
           ComputeColocatedTemporalScaling (pCtx);
 
-        if (pThreadCtx != NULL) {
+        if (GetThreadCount (pCtx) > 1) {
           memset (&pCtx->lastReadyHeightOffset[0][0], -1, LIST_A * MAX_REF_PIC_COUNT * sizeof (int16_t));
           SET_EVENT (&pThreadCtx->sSliceDecodeStart);
           iRet = WelsDecodeAndConstructSlice (pCtx);
@@ -2765,7 +2751,7 @@
           }
         }
 
-        if (pThreadCtx == NULL && bReconstructSlice) {
+        if (GetThreadCount (pCtx) <= 1 && bReconstructSlice) {
           if ((iRet = WelsDecodeConstructSlice (pCtx, pNalCur)) != ERR_NONE) {
             pCtx->pDec->bIsComplete = false; // reconstruction error, directly set the flag false
             return iRet;
@@ -2772,7 +2758,7 @@
           }
         }
         if (bAllRefComplete && pCtx->eSliceType != I_SLICE) {
-          if (pCtx->pThreadCtx == NULL) {
+          if (GetThreadCount (pCtx) <= 1) {
             if (pCtx->sRefPic.uiRefCount[LIST_0] > 0) {
               bAllRefComplete &= CheckRefPicturesComplete (pCtx);
             } else {
@@ -2829,17 +2815,18 @@
         }
       }
 
-      if (pThreadCtx != NULL && pCtx->uiDecodingTimeStamp > 1 && pCtx->pLastDecPicInfo->uiDecodingTimeStamp > 0) {
+      if (GetThreadCount (pCtx) > 1 && pCtx->uiDecodingTimeStamp > 1 && pCtx->pLastDecPicInfo->uiDecodingTimeStamp > 0) {
         while (pCtx->uiDecodingTimeStamp > pCtx->pLastDecPicInfo->uiDecodingTimeStamp + 1) {
           WelsSleep (1);
         }
       }
-      if (pThreadCtx != NULL) {
+
+      if (GetThreadCount (pCtx) >= 1) {
         pCtx->pLastDecPicInfo->uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
       }
       iRet = DecodeFrameConstruction (pCtx, ppDst, pDstInfo);
       if (iRet) {
-        if (pThreadCtx != NULL) {
+        if (GetThreadCount (pCtx) > 1) {
           SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
         }
         return iRet;
@@ -2847,7 +2834,7 @@
 
       pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb = pCtx->pDec; //store latest decoded picture for EC
       pCtx->bUsedAsRef = pCtx->uiNalRefIdc > 0;
-      if (pCtx->pThreadCtx == NULL) {
+      if (GetThreadCount (pCtx) <= 1) {
         if (pCtx->bUsedAsRef) {
           for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
             uint32_t i = 0;
@@ -2870,7 +2857,7 @@
                                       pCtx->pDec->iLinesize,
                                       pCtx->sExpandPicFunc.pfExpandLumaPicture, pCtx->sExpandPicFunc.pfExpandChromaPicture);
         }
-      } else {
+      } else if (GetThreadCount (pCtx) > 1) {
         SET_EVENT (&pThreadCtx->sImageReady);
       }
       pCtx->pDec = NULL; //after frame decoding, always set to NULL
@@ -2881,7 +2868,7 @@
       pCtx->pLastDecPicInfo->iPrevFrameNum = pSh->iFrameNum;
     if (pCtx->pLastDecPicInfo->bLastHasMmco5)
       pCtx->pLastDecPicInfo->iPrevFrameNum = 0;
-    if (pThreadCtx != NULL) {
+    if (GetThreadCount (pCtx) > 1) {
       int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
       int32_t  id = pThreadCtx->sThreadInfo.uiThrNum;
       for (int32_t i = 0; i < threadCount; ++i) {
@@ -2899,7 +2886,7 @@
       }
     }
   }
-  if (pThreadCtx != NULL) {
+  if (GetThreadCount (pCtx) > 1) {
     SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
   }
   return ERR_NONE;
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -80,6 +80,7 @@
     pRef->uiSpatialId = -1;
     pRef->iSpsId = -1;
     pRef->bIsComplete = false;
+    pRef->iRefCount = 0;
 
     if (pRef->eSliceType == I_SLICE) {
       return;
@@ -88,7 +89,7 @@
     for (int32_t i = 0; i < MAX_DPB_COUNT; ++i) {
       for (int32_t list = 0; list < lists; ++list) {
         if (pRef->pRefPic[list][i] != NULL) {
-          pRef->pRefPic[list][i]->bAvailableFlag = true;
+          pRef->pRefPic[list][i]->iRefCount = 0;
           pRef->pRefPic[list][i] = NULL;
         }
       }
@@ -781,8 +782,8 @@
   for (i = 0; i < pRefPic->uiShortRefCount[LIST_0]; i++) {
     if (pRefPic->pShortRefList[LIST_0][i]->iFrameNum == iFrameNum) {
       iMoveSize = pRefPic->uiShortRefCount[LIST_0] - i - 1;
-      pRefPic->pShortRefList[LIST_0][i]->bUsedAsRef = false;
       pPic = pRefPic->pShortRefList[LIST_0][i];
+      pPic->bUsedAsRef = false;
       pRefPic->pShortRefList[LIST_0][i] = NULL;
       if (iMoveSize > 0) {
         memmove (&pRefPic->pShortRefList[LIST_0][i], &pRefPic->pShortRefList[LIST_0][i + 1],
--- a/codec/decoder/core/src/mv_pred.cpp
+++ b/codec/decoder/core/src/mv_pred.cpp
@@ -315,7 +315,7 @@
   mbType = GetMbType (pCurDqLayer)[iMbXy];
 
   PPicture colocPic = pCtx->sRefPic.pRefList[LIST_1][0];
-  if (pCtx->pThreadCtx != NULL) {
+  if (GetThreadCount (pCtx) > 1) {
     if (16 * pCurDqLayer->iMbY > pCtx->lastReadyHeightOffset[1][0]) {
       if (colocPic->pReadyEvent[pCurDqLayer->iMbY].isSignaled != 1) {
         WAIT_EVENT (&colocPic->pReadyEvent[pCurDqLayer->iMbY], WELS_DEC_THREAD_WAIT_INFINITE);
--- a/codec/decoder/core/src/parse_mb_syn_cabac.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cabac.cpp
@@ -535,7 +535,7 @@
   pRefCount[0] = pSliceHeader->uiRefCount[0];
   pRefCount[1] = pSliceHeader->uiRefCount[1];
 
-  bool bIsPending = pCtx->pThreadCtx != NULL;
+  bool bIsPending = GetThreadCount (pCtx) > 1;
 
   switch (pCurDqLayer->pDec->pMbType[iMbXy]) {
   case MB_TYPE_16x16: {
@@ -741,7 +741,7 @@
 
   MbType mbType = pCurDqLayer->pDec->pMbType[iMbXy];
 
-  bool bIsPending = pCtx->pThreadCtx != NULL;
+  bool bIsPending = GetThreadCount (pCtx) > 1;
 
   if (IS_DIRECT (mbType)) {
 
--- a/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -1083,7 +1083,7 @@
   iRefCount[0] = pSliceHeader->uiRefCount[0];
   iRefCount[1] = pSliceHeader->uiRefCount[1];
 
-  bool bIsPending = pCtx->pThreadCtx != NULL;
+  bool bIsPending = GetThreadCount (pCtx) > 1;
 
   switch (pCurDqLayer->pDec->pMbType[iMbXy]) {
   case MB_TYPE_16x16: {
@@ -1348,7 +1348,7 @@
   iRefCount[0] = pSliceHeader->uiRefCount[0];
   iRefCount[1] = pSliceHeader->uiRefCount[1];
 
-  bool bIsPending = pCtx->pThreadCtx != NULL;
+  bool bIsPending = GetThreadCount (pCtx) > 1;
 
   MbType mbType = pCurDqLayer->pDec->pMbType[iMbXy];
   if (IS_DIRECT (mbType)) {
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -106,12 +106,14 @@
   pPic->iWidthInPixel  = kiPicWidth;
   pPic->iHeightInPixel = kiPicHeight;
   pPic->iFrameNum      = -1;
-  pPic->bAvailableFlag = true;
+  pPic->iRefCount = 0;
 
   uint32_t uiMbWidth = (kiPicWidth + 15) >> 4;
   uint32_t uiMbHeight = (kiPicHeight + 15) >> 4;
   uint32_t uiMbCount = uiMbWidth * uiMbHeight;
+
   pPic->pMbCorrectlyDecodedFlag = (bool*)pMa->WelsMallocz (uiMbCount * sizeof (bool), "pPic->pMbCorrectlyDecodedFlag");
+
   pPic->pMbType = (uint32_t*)pMa->WelsMallocz (uiMbCount * sizeof (uint32_t), "pPic->pMbType");
   pPic->pMv[LIST_0] = (int16_t (*)[16][2])pMa->WelsMallocz (uiMbCount * sizeof (
                         int16_t) * MV_A * MB_BLOCK4x4_NUM, "pPic->pMv[]");
@@ -182,8 +184,8 @@
   }
 
   for (iPicIdx = pPicBuf->iCurrentIdx + 1; iPicIdx < pPicBuf->iCapacity ; ++iPicIdx) {
-    if (pPicBuf->ppPic[iPicIdx] != NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag
-        && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef) {
+    if (pPicBuf->ppPic[iPicIdx] != NULL && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef
+        && pPicBuf->ppPic[iPicIdx]->iRefCount <= 0) {
       pPic = pPicBuf->ppPic[iPicIdx];
       break;
     }
@@ -194,8 +196,8 @@
     return pPic;
   }
   for (iPicIdx = 0 ; iPicIdx <= pPicBuf->iCurrentIdx ; ++iPicIdx) {
-    if (pPicBuf->ppPic[iPicIdx] != NULL && pPicBuf->ppPic[iPicIdx]->bAvailableFlag
-        && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef) {
+    if (pPicBuf->ppPic[iPicIdx] != NULL && !pPicBuf->ppPic[iPicIdx]->bUsedAsRef
+        && pPicBuf->ppPic[iPicIdx]->iRefCount <= 0) {
       pPic = pPicBuf->ppPic[iPicIdx];
       break;
     }
--- a/codec/decoder/core/src/rec_mb.cpp
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -252,7 +252,7 @@
   iFullMVy = WELS_CLIP3 (iFullMVy, ((-PADDING_LENGTH + 2) * (1 << 2)),
                          ((pMCRefMem->iPicHeight + PADDING_LENGTH - 19) * (1 << 2)));
 
-  if (pCtx->pThreadCtx != NULL && iRefIdx >= 0) {
+  if (GetThreadCount (pCtx) > 1 && iRefIdx >= 0) {
     // wait for the lines of reference macroblock (3 + 16).
     PPicture pRefPic = pCtx->sRefPic.pRefList[listIdx][iRefIdx];
     if (pCtx->bNewSeqBegin && (pCtx->iErrorCode & dsRefLost)) {
--- a/codec/decoder/plus/inc/welsDecoderExt.h
+++ b/codec/decoder/plus/inc/welsDecoderExt.h
@@ -120,6 +120,7 @@
   bool                    m_bIsBaseline;
   int32_t                 m_iCpuCount;
   int32_t                 m_iThreadCount;
+  int32_t                 m_iCtxCount;
   PPicBuff                m_pPicBuff;
   bool                    m_bParamSetsLostFlag;
   bool                    m_bFreezeOutput;
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -105,7 +105,9 @@
     RESET_EVENT (&pLastThreadCtx->sSliceDecodeStart);
   }
   pThrCtx->pDec = NULL;
-  RESET_EVENT (&pThrCtx->sSliceDecodeFinsh);
+  if (GetThreadCount (pThrCtx->pCtx) > 1) {
+    RESET_EVENT (&pThrCtx->sSliceDecodeFinsh);
+  }
   iRet |= pWelsDecoder->DecodeFrame2WithCtx (pThrCtx->pCtx, NULL, 0, pThrCtx->ppDst, &pThrCtx->sDstInfo);
 
   //WelsMutexUnlock (&pWelsDecoder->m_csDecoder);
@@ -133,7 +135,8 @@
     m_uiDecodeTimeStamp (0),
     m_bIsBaseline (false),
     m_iCpuCount (1),
-    m_iThreadCount (1),
+    m_iThreadCount (0),
+    m_iCtxCount (1),
     m_pPicBuff (NULL),
     m_bParamSetsLostFlag (false),
     m_bFreezeOutput (false),
@@ -167,8 +170,9 @@
   if (m_iCpuCount > WELS_DEC_MAX_NUM_CPU) {
     m_iCpuCount = WELS_DEC_MAX_NUM_CPU;
   }
-  m_pDecThrCtx = new SWelsDecoderThreadCTX[m_iThreadCount];
-  memset (m_pDecThrCtx, 0, sizeof (SWelsDecoderThreadCTX)*m_iThreadCount);
+
+  m_pDecThrCtx = new SWelsDecoderThreadCTX[m_iCtxCount];
+  memset (m_pDecThrCtx, 0, sizeof (SWelsDecoderThreadCTX)*m_iCtxCount);
   for (int32_t i = 0; i < WELS_DEC_MAX_NUM_CPU; ++i) {
     m_pDecThrCtxActive[i] = NULL;
   }
@@ -277,7 +281,7 @@
 }
 
 void CWelsDecoder::UninitDecoder (void) {
-  for (int32_t i = 0; i < m_iThreadCount; ++i) {
+  for (int32_t i = 0; i < m_iCtxCount; ++i) {
     if (m_pDecThrCtx[i].pCtx != NULL) {
       if (i > 0) {
         WelsResetRefPicWithoutUnRef (m_pDecThrCtx[i].pCtx);
@@ -288,7 +292,7 @@
 }
 
 void CWelsDecoder::OpenDecoderThreads() {
-  if (m_iThreadCount > 1) {
+  if (m_iThreadCount >= 1) {
     m_uiDecodeTimeStamp = 0;
     CREATE_SEMAPHORE (&m_sIsBusy, m_iThreadCount, m_iThreadCount, NULL);
     WelsMutexInit (&m_csDecoder);
@@ -318,7 +322,7 @@
   }
 }
 void CWelsDecoder::CloseDecoderThreads() {
-  if (m_iThreadCount > 1) {
+  if (m_iThreadCount >= 1) {
     for (int32_t i = 0; i < m_iThreadCount; i++) { //waiting the completion begun slices
       WAIT_SEMAPHORE (&m_pDecThrCtx[i].sThreadInfo.sIsIdle, WELS_DEC_THREAD_WAIT_INFINITE);
       m_pDecThrCtx[i].sThreadInfo.uiCommand = WELS_DEC_THREAD_COMMAND_ABORT;
@@ -367,8 +371,8 @@
   WelsLog (&m_pWelsTrace->m_sLogCtx, WELS_LOG_INFO,
            "CWelsDecoder::init_decoder(), openh264 codec version = %s, ParseOnly = %d",
            VERSION_NUMBER, (int32_t)pParam->bParseOnly);
-  if (m_iThreadCount > 1 && pParam->bParseOnly) {
-    m_iThreadCount = 1;
+  if (m_iThreadCount >= 1 && pParam->bParseOnly) {
+    m_iThreadCount = 0;
   }
   OpenDecoderThreads();
   //reset decoder context
@@ -377,9 +381,9 @@
   memset (&m_sVlcTable, 0, sizeof (SVlcTable));
   UninitDecoder();
   WelsDecoderLastDecPicInfoDefaults (m_sLastDecPicInfo);
-  for (int32_t i = 0; i < m_iThreadCount; ++i) {
+  for (int32_t i = 0; i < m_iCtxCount; ++i) {
     InitDecoderCtx (m_pDecThrCtx[i].pCtx, pParam);
-    if (m_iThreadCount > 1) {
+    if (m_iThreadCount >= 1) {
       m_pDecThrCtx[i].pCtx->pThreadCtx = &m_pDecThrCtx[i];
     }
   }
@@ -429,7 +433,7 @@
 
 int32_t CWelsDecoder::ResetDecoder (PWelsDecoderContext& pCtx) {
   // TBC: need to be modified when context and trace point are null
-  if (m_iThreadCount > 1) {
+  if (m_iThreadCount >= 1) {
     ThreadResetDecoder (pCtx);
   } else {
     if (pCtx != NULL && m_pWelsTrace != NULL) {
@@ -472,9 +476,8 @@
   if (eOptID == DECODER_OPTION_NUM_OF_THREADS) {
     if (pOption != NULL) {
       int32_t threadCount = * ((int32_t*)pOption);
-      if (threadCount <= 0) {
-        threadCount = 1;
-      } else if (threadCount > m_iCpuCount) {
+      if (threadCount < 0) threadCount = 0;
+      if (threadCount > m_iCpuCount) {
         threadCount = m_iCpuCount;
       }
       if (threadCount > 3) {
@@ -484,14 +487,15 @@
         m_iThreadCount = threadCount;
         if (m_pDecThrCtx != NULL) {
           delete [] m_pDecThrCtx;
-          m_pDecThrCtx = new SWelsDecoderThreadCTX[m_iThreadCount];
-          memset (m_pDecThrCtx, 0, sizeof (SWelsDecoderThreadCTX)*m_iThreadCount);
+          m_iCtxCount = m_iThreadCount == 0 ? 1 : m_iThreadCount;
+          m_pDecThrCtx = new SWelsDecoderThreadCTX[m_iCtxCount];
+          memset (m_pDecThrCtx, 0, sizeof (SWelsDecoderThreadCTX)*m_iCtxCount);
         }
       }
     }
     return cmResultSuccess;
   }
-  for (int32_t i = 0; i < m_iThreadCount; ++i) {
+  for (int32_t i = 0; i < m_iCtxCount; ++i) {
     PWelsDecoderContext pDecContext = m_pDecThrCtx[i].pCtx;
     if (pDecContext == NULL && eOptID != DECODER_OPTION_TRACE_LEVEL &&
         eOptID != DECODER_OPTION_TRACE_CALLBACK && eOptID != DECODER_OPTION_TRACE_CALLBACK_CONTEXT)
@@ -502,6 +506,8 @@
 
       iVal = * ((int*)pOption); // boolean value for whether enabled End Of Stream flag
 
+      if (pDecContext == NULL) return dsInitialOptExpected;
+
       pDecContext->bEndOfStreamFlag = iVal ? true : false;
 
       return cmResultSuccess;
@@ -509,6 +515,8 @@
       if (pOption == NULL)
         return cmInitParaError;
 
+      if (pDecContext == NULL) return dsInitialOptExpected;
+
       iVal = * ((int*)pOption); // int value for error concealment idc
       iVal = WELS_CLIP3 (iVal, (int32_t)ERROR_CON_DISABLE, (int32_t)ERROR_CON_SLICE_MV_COPY_CROSS_IDR_FREEZE_RES_CHANGE);
       if ((pDecContext->pParam->bParseOnly) && (iVal != (int32_t)ERROR_CON_DISABLE)) {
@@ -550,6 +558,7 @@
       return cmInitParaError;
     } else if (eOptID == DECODER_OPTION_STATISTICS_LOG_INTERVAL) {
       if (pOption) {
+        if (pDecContext == NULL) return dsInitialOptExpected;
         pDecContext->pDecoderStatistics->iStatisticsLogInterval = (* ((unsigned int*)pOption));
         return cmResultSuccess;
       }
@@ -681,7 +690,7 @@
     unsigned char** ppDst,
     SBufferInfo* pDstInfo) {
   int iRet = dsErrorFree;
-  if (m_iThreadCount > 1) {
+  if (m_iThreadCount >= 1) {
     iRet = ThreadDecodeFrameInternal (kpSrc, kiSrcLen, ppDst, pDstInfo);
     if (m_sReoderingStatus.iNumOfPicts) {
       WAIT_EVENT (&m_sBufferingEvent, WELS_DEC_THREAD_WAIT_INFINITE);
@@ -742,6 +751,9 @@
     }
 #endif//OUTPUT_BIT_STREAM
     pDecContext->bEndOfStreamFlag = false;
+    if (GetThreadCount (pDecContext) <= 0) {
+      pDecContext->uiDecodingTimeStamp = ++m_uiDecodeTimeStamp;
+    }
   } else {
     //For application MODE, the error detection should be added for safe.
     //But for CONSOLE MODE, when decoding LAST AU, kiSrcLen==0 && kpSrc==NULL.
@@ -752,13 +764,13 @@
   int64_t iStart, iEnd;
   iStart = WelsTime();
 
-  if (pDecContext->pThreadCtx == NULL) {
+  if (GetThreadCount (pDecContext) <= 1) {
     ppDst[0] = ppDst[1] = ppDst[2] = NULL;
   }
   pDecContext->iErrorCode = dsErrorFree; //initialize at the starting of AU decoding.
   pDecContext->iFeedbackVclNalInAu = FEEDBACK_UNKNOWN_NAL; //initialize
   unsigned long long uiInBsTimeStamp = pDstInfo->uiInBsTimeStamp;
-  if (pDecContext->pThreadCtx == NULL) {
+  if (GetThreadCount (pDecContext) <= 1) {
     memset (pDstInfo, 0, sizeof (SBufferInfo));
   }
   pDstInfo->uiInBsTimeStamp = uiInBsTimeStamp;
@@ -856,7 +868,7 @@
 
     OutputStatisticsLog (*pDecContext->pDecoderStatistics);
 
-    if (pDecContext->pThreadCtx != NULL) {
+    if (GetThreadCount (pDecContext) >= 1) {
       WAIT_EVENT (&m_sReleaseBufferEvent, WELS_DEC_THREAD_WAIT_INFINITE);
       RESET_EVENT (&m_sBufferingEvent);
       BufferingReadyPicture (pDecContext, ppDst, pDstInfo);
@@ -882,7 +894,7 @@
   iEnd = WelsTime();
   pDecContext->dDecTime += (iEnd - iStart) / 1e3;
 
-  if (pDecContext->pThreadCtx != NULL) {
+  if (GetThreadCount (pDecContext) >= 1) {
     WAIT_EVENT (&m_sReleaseBufferEvent, WELS_DEC_THREAD_WAIT_INFINITE);
     RESET_EVENT (&m_sBufferingEvent);
     BufferingReadyPicture (pDecContext, ppDst, pDstInfo);
@@ -904,7 +916,7 @@
 DECODING_STATE CWelsDecoder::FlushFrame (unsigned char** ppDst,
     SBufferInfo* pDstInfo) {
   bool bEndOfStreamFlag = true;
-  for (int32_t j = 0; j < m_iThreadCount; ++j) {
+  for (int32_t j = 0; j < m_iCtxCount; ++j) {
     if (!m_pDecThrCtx[j].pCtx->bEndOfStreamFlag) {
       bEndOfStreamFlag = false;
     }
@@ -934,9 +946,10 @@
     ppDst[1] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[1];
     ppDst[2] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[2];
     m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPOC = IMinInt32;
-    PPicBuff pPicBuff = m_iThreadCount == 1 ? m_pDecThrCtx[0].pCtx->pPicBuff : m_pPicBuff;
+    PPicBuff pPicBuff = m_iThreadCount <= 1 ? m_pDecThrCtx[0].pCtx->pPicBuff : m_pPicBuff;
     if (m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx < pPicBuff->iCapacity) {
-      pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx]->bAvailableFlag = true;
+      PPicture pPic = pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx];
+      --pPic->iRefCount;
     }
     m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].bLastGOP = false;
     m_sReoderingStatus.iMinPOC = IMinInt32;
@@ -1001,6 +1014,7 @@
     if (m_sReoderingStatus.iNumOfPicts && pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb
         && pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb->bNewSeqBegin) {
       m_sReoderingStatus.iLastGOPRemainPicts = m_sReoderingStatus.iNumOfPicts;
+
       for (int32_t i = 0; i <= m_sReoderingStatus.iLargestBufferedPicIndex; ++i) {
         if (m_sPictInfoList[i].iPOC > IMinInt32) {
           m_sPictInfoList[i].bLastGOP = true;
@@ -1036,7 +1050,7 @@
       m_sPictInfoList[i].iPOC = pCtx->pSliceHeader->iPicOrderCntLsb;
       m_sPictInfoList[i].uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
       m_sPictInfoList[i].iPicBuffIdx = pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb->iPicBuffIdx;
-      pCtx->pPicBuff->ppPic[m_sPictInfoList[i].iPicBuffIdx]->bAvailableFlag = false;
+      if (GetThreadCount (pCtx) <= 1) ++pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb->iRefCount;
       m_sPictInfoList[i].bLastGOP = false;
       pDstInfo->iBufferStatus = 0;
       ++m_sReoderingStatus.iNumOfPicts;
@@ -1051,6 +1065,9 @@
 void CWelsDecoder::ReleaseBufferedReadyPicture (PWelsDecoderContext pCtx, unsigned char** ppDst,
     SBufferInfo* pDstInfo) {
   PPicBuff pPicBuff = pCtx ? pCtx->pPicBuff : m_pPicBuff;
+  if (pCtx == NULL && m_iThreadCount <= 1) {
+    pCtx = m_pDecThrCtx[0].pCtx;
+  }
   if (!m_bIsBaseline && m_sReoderingStatus.iLastGOPRemainPicts > 0) {
     m_sReoderingStatus.iMinPOC = IMinInt32;
     for (int32_t i = 0; i <= m_sReoderingStatus.iLargestBufferedPicIndex; ++i) {
@@ -1075,7 +1092,8 @@
     ppDst[1] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[1];
     ppDst[2] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[2];
     m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPOC = IMinInt32;
-    pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx]->bAvailableFlag = true;
+    PPicture pPic = pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx];
+    --pPic->iRefCount;
     m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].bLastGOP = false;
     m_sReoderingStatus.iMinPOC = IMinInt32;
     --m_sReoderingStatus.iNumOfPicts;
@@ -1107,7 +1125,8 @@
       ppDst[1] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[1];
       ppDst[2] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[2];
       m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPOC = IMinInt32;
-      pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx]->bAvailableFlag = true;
+      PPicture pPic = pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx];
+      --pPic->iRefCount;
       --m_sReoderingStatus.iNumOfPicts;
     }
     return;
@@ -1147,7 +1166,8 @@
       ppDst[1] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[1];
       ppDst[2] = m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].pData[2];
       m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPOC = IMinInt32;
-      pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx]->bAvailableFlag = true;
+      PPicture pPic = pPicBuff->ppPic[m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].iPicBuffIdx];
+      --pPic->iRefCount;
       m_sPictInfoList[m_sReoderingStatus.iPictInfoIndex].bLastGOP = false;
       m_sReoderingStatus.iMinPOC = IMinInt32;
       --m_sReoderingStatus.iNumOfPicts;
@@ -1159,7 +1179,6 @@
     SBufferInfo* pDstInfo) {
   DECODING_STATE iRet = dsErrorFree;
   if (pDstInfo->iBufferStatus == 1) {
-    ++pDecContext->uiDecodingTimeStamp;
     m_bIsBaseline = pDecContext->pSps->uiProfileIdc == 66 || pDecContext->pSps->uiProfileIdc == 83;
     if (!m_bIsBaseline) {
       BufferingReadyPicture (pDecContext, ppDst, pDstInfo);
@@ -1364,7 +1383,9 @@
   memcpy (&m_pDecThrCtx[signal].sDstInfo, pDstInfo, sizeof (SBufferInfo));
 
   ParseAccessUnit (m_pDecThrCtx[signal]);
-  m_pLastDecThrCtx = &m_pDecThrCtx[signal];
+  if (m_iThreadCount > 1) {
+    m_pLastDecThrCtx = &m_pDecThrCtx[signal];
+  }
   m_pDecThrCtx[signal].sThreadInfo.uiCommand = WELS_DEC_THREAD_COMMAND_RUN;
   RELEASE_SEMAPHORE (&m_pDecThrCtx[signal].sThreadInfo.sIsActivated);