shithub: openh264

Download patch

ref: 57bd721b2f082801590a8ebe9b03e52a08a56f18
parent: 82a0d3e4a2236e3913b91e7e36e19dc667bc2a80
author: huili2 <[email protected]>
date: Mon Jun 8 07:06:32 EDT 2015

add sub8x8 mode decision functions

--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -1292,8 +1292,10 @@
 #if defined (X86_ASM)
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width9Or17_sse2;
+#if 1 //could not work well for sub8x8: should disable it for now, or bugfix for it!
     pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height9Or17_sse2;
     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width9Or17Height9Or17_sse2;
+#endif
     pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
     pMcFuncs->pMcChromaFunc     = McChroma_sse2;
     pMcFuncs->pMcLumaFunc       = McLuma_sse2;
--- a/codec/encoder/core/inc/mv_pred.h
+++ b/codec/encoder/core/inc/mv_pred.h
@@ -84,6 +84,30 @@
                            SMVUnitXY* pMv);
 
 /*!
+ * \brief   store the MV of one 4x4 block in pCurMb and the MV plus ref index in the pMbCache caches, only for P_4x4
+ * \param   pMbCache,pCurMb         cache and MB that receive the motion info
+ * \param   kiPartIdx,kiRef,pMv     4x4 block index (looked up in the scan tables), reference index and MV to store
+ */
+void UpdateP4x4MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv);
+
+/*!
+ * \brief   store the MV of one 8x4 block (two consecutive 4x4 entries) in pCurMb and the MV plus ref index in the pMbCache caches, only for P_8x4
+ * \param   pMbCache,pCurMb         cache and MB that receive the motion info
+ * \param   kiPartIdx,kiRef,pMv     index of the first 4x4 block (looked up in the scan tables), reference index and MV to store
+ */
+void UpdateP8x4MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv);
+
+/*!
+ * \brief   store the MV of one 4x8 block (two 4x4 entries: the indexed one and the one +4 in sMv / +6 in the cache) in pCurMb and the MV plus ref index in the pMbCache caches, only for P_4x8
+ * \param   pMbCache,pCurMb         cache and MB that receive the motion info
+ * \param   kiPartIdx,kiRef,pMv     index of the first 4x4 block (looked up in the scan tables), reference index and MV to store
+ */
+void UpdateP4x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv);
+
+/*!
  * \brief   get the motion predictor for 4*4 or 8*8 or 16*16 block
  * \param
  * \param   output mvp_x and mvp_y
@@ -135,6 +159,7 @@
  * \param
  */
 void UpdateP8x16Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+
 /*!
  * \brief   only update pMv cache for current MB, only for P_8x8
  * \param
@@ -141,5 +166,26 @@
  * \param
  */
 void UpdateP8x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   update only the MV and ref index caches in pMbCache for one 4x4 block (no SMB is written), only for P_4x4
+ * \param   iPartIdx    4x4 block index, looked up in the cache scan table
+ * \param   iRef,pMv    reference index and motion vector to store
+ */
+void UpdateP4x4Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   update only the MV and ref index caches in pMbCache for one 8x4 block (two consecutive cache entries, no SMB is written), only for P_8x4
+ * \param   iPartIdx    index of the first 4x4 block, looked up in the cache scan table
+ * \param   iRef,pMv    reference index and motion vector to store
+ */
+void UpdateP8x4Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   update only the MV and ref index caches in pMbCache for one 4x8 block (cache entries at +0 and +6, no SMB is written), only for P_4x8
+ * \param   iPartIdx    index of the first 4x4 block, looked up in the cache scan table
+ * \param   iRef,pMv    reference index and motion vector to store
+ */
+void UpdateP4x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
 }
 #endif//WELS_MV_PRED_H__
--- a/codec/encoder/core/inc/svc_base_layer_md.h
+++ b/codec/encoder/core/inc/svc_base_layer_md.h
@@ -63,6 +63,9 @@
 int32_t WelsMdP16x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
 int32_t WelsMdP8x16 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
 int32_t WelsMdP8x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice);
+int32_t WelsMdP4x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice, const int32_t ki8x8Idx); //motion search on the four 4x4 blocks of 8x8 block ki8x8Idx, returns their summed uiSatdCost
+int32_t WelsMdP8x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice, const int32_t ki8x8Idx); //motion search on the two 8x4 blocks of 8x8 block ki8x8Idx, returns their summed uiSatdCost
+int32_t WelsMdP4x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice, const int32_t ki8x8Idx); //motion search on the two 4x8 blocks of 8x8 block ki8x8Idx, returns their summed uiSatdCost
 /*static*/  void WelsMdInterInit (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb, const int32_t kiSliceFirstMbXY);
 /*static*/ void WelsMdInterFinePartition (sWelsEncCtx* pEnc, SWelsMD* pMd, SSlice* pSlice, SMB* pCurMb, int32_t bestCost);
 /*static*/ void WelsMdInterFinePartitionVaa (sWelsEncCtx* pEnc, SWelsMD* pMd, SSlice* pSlice, SMB* pCurMb, int32_t bestCost);
--- a/codec/encoder/core/inc/svc_enc_macroblock.h
+++ b/codec/encoder/core/inc/svc_enc_macroblock.h
@@ -50,6 +50,7 @@
 /*************************mb_layer() syntax and generated********************************/
 /*mb_layer():*/
 Mb_Type         uiMbType;       // including MB detailed partition type, number and type of reference list
+Mb_Type         uiSubMbType[4]; // sub MB types
 int32_t         iMbXY;          // offset position of MB top left point based
 int16_t         iMbX;           // position of MB in horizontal axis [0..32767]
 int16_t         iMbY;           // position of MB in vertical axis [0..32767]
@@ -71,7 +72,7 @@
 uint16_t        uiSliceIdc;     // 2^16=65536 > MaxFS(36864) of level 5.1; AVC: pFirstMbInSlice?; SVC: (pFirstMbInSlice << 7) | ((uiDependencyId << 4) | uiQualityId);
 uint32_t        uiChromPredMode;
 int32_t         iLumaDQp;
-SMVUnitXY       sMvd[4];
+SMVUnitXY       sMvd[MB_BLOCK4x4_NUM]; //only for CABAC writing; storage structure the same as sMv, in 4x4 scan order.
 int32_t         iCbpDc;
 //uint8_t         reserved_filling_bytes[1];      // not deleting this line for further changes of this structure. filling bytes reserved to make structure aligned with 4 bytes, higher cache hit on less structure size by 2 cache lines( 2 * 64 bytes) once hit
 } SMB, *PMb;
--- a/codec/encoder/core/src/mv_pred.cpp
+++ b/codec/encoder/core/src/mv_pred.cpp
@@ -301,7 +301,51 @@
       pMvComp->sMotionVectorCache[kiCacheIdx6] =
         pMvComp->sMotionVectorCache[kiCacheIdx7] = *pMv;
 }
+//store *pMv in pCurMb->sMv and, together with kiRef, in the Mb_cache MV/ref-index caches; one 4x4 block, only for P4x4
+void UpdateP4x4MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx]; //position of the block in pCurMb->sMv
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx]; //position of the block in the cache arrays
 
+  //mb: only the MV is stored here, kiRef is not written to pCurMb by this function
+  pCurMb->sMv[kiScan4Idx] = *pMv;
+  //cache: ref index and MV
+  pMvComp->iRefIndexCache[kiCacheIdx] = kiRef;
+  pMvComp->sMotionVectorCache[kiCacheIdx] = *pMv;
+}
+//store *pMv in pCurMb->sMv and, together with kiRef, in the Mb_cache MV/ref-index caches; one 8x4 block = two consecutive 4x4 entries, only for P8x4
+void UpdateP8x4MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx]; //position of the first 4x4 block in pCurMb->sMv
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx]; //position of the first 4x4 block in the cache arrays
+
+  //mb: only the MV is stored here (entry and the next one), kiRef is not written to pCurMb by this function
+  pCurMb->sMv[    kiScan4Idx] = *pMv;
+  pCurMb->sMv[1 + kiScan4Idx] = *pMv;
+  //cache: ref index and MV, entry and the next one
+  pMvComp->iRefIndexCache[    kiCacheIdx] = kiRef;
+  pMvComp->iRefIndexCache[1 + kiCacheIdx] = kiRef;
+  pMvComp->sMotionVectorCache[    kiCacheIdx] = *pMv;
+  pMvComp->sMotionVectorCache[1 + kiCacheIdx] = *pMv;
+}
+//store *pMv in pCurMb->sMv and, together with kiRef, in the Mb_cache MV/ref-index caches; one 4x8 block = two 4x4 entries, only for P4x8
+void UpdateP4x8MotionInfo (SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef,
+                           SMVUnitXY* pMv) {
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx]; //position of the first 4x4 block in pCurMb->sMv
+  const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx]; //position of the first 4x4 block in the cache arrays
+
+  //mb: only the MV is stored here; second entry is 4 positions later (presumably the 4x4 block directly below - confirm sMv layout)
+  pCurMb->sMv[    kiScan4Idx] = *pMv;
+  pCurMb->sMv[4 + kiScan4Idx] = *pMv;
+  //cache: ref index and MV; second entry is 6 positions later (presumably one cache row - confirm cache layout)
+  pMvComp->iRefIndexCache[    kiCacheIdx] = kiRef;
+  pMvComp->iRefIndexCache[6 + kiCacheIdx] = kiRef;
+  pMvComp->sMotionVectorCache[    kiCacheIdx] = *pMv;
+  pMvComp->sMotionVectorCache[6 + kiCacheIdx] = *pMv;
+}
 //=========================update motion info(MV and ref_idx) into Mb_cache==========================
 //update pMv and uiRefIndex cache only for Mb_cache, only for P_16*16 (SKIP inclusive)
 
@@ -359,4 +403,34 @@
         pMvComp->sMotionVectorCache[7 + kuiCacheIdx] = *pMv;
 }
 
+//write pRef and *pMv into the Mb_cache ref-index/MV caches only (no SMB involved), one 4x4 block, for P4x4
+void UpdateP4x4Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv) { //NOTE(review): pRef is an int8_t value, not a pointer; the header names it iRef
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; //position of the block in the cache arrays
+
+  pMvComp->iRefIndexCache    [kuiCacheIdx] = pRef;
+  pMvComp->sMotionVectorCache[kuiCacheIdx] = *pMv;
+}
+
+//write pRef and *pMv into the Mb_cache ref-index/MV caches only (no SMB involved), one 8x4 block = two consecutive cache entries, for P8x4
+void UpdateP8x4Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv) { //NOTE(review): pRef is an int8_t value, not a pointer; the header names it iRef
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; //position of the first 4x4 block in the cache arrays
+
+  pMvComp->iRefIndexCache      [    kuiCacheIdx] =
+    pMvComp->iRefIndexCache    [1 + kuiCacheIdx] = pRef;
+  pMvComp->sMotionVectorCache  [    kuiCacheIdx] =
+    pMvComp->sMotionVectorCache[1 + kuiCacheIdx] = *pMv;
+}
+
+//write pRef and *pMv into the Mb_cache ref-index/MV caches only (no SMB involved), one 4x8 block = cache entries at +0 and +6, for P4x8
+void UpdateP4x8Motion2Cache (SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv) { //NOTE(review): pRef is an int8_t value, not a pointer; the header names it iRef
+  SMVComponentUnit* pMvComp = &pMbCache->sMvComponents;
+  const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; //position of the first 4x4 block in the cache arrays
+
+  pMvComp->iRefIndexCache      [    kuiCacheIdx] =
+    pMvComp->iRefIndexCache    [6 + kuiCacheIdx] = pRef;
+  pMvComp->sMotionVectorCache  [    kuiCacheIdx] =
+    pMvComp->sMotionVectorCache[6 + kuiCacheIdx] = *pMv;
+}
 } // namespace WelsEnc
--- a/codec/encoder/core/src/svc_base_layer_md.cpp
+++ b/codec/encoder/core/src/svc_base_layer_md.cpp
@@ -1117,6 +1117,124 @@
   return iCostP8x8;
 }
 
+int32_t WelsMdP4x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
+                    const int32_t ki8x8Idx) { //motion search on the four 4x4 blocks of 8x8 block ki8x8Idx; returns the sum of their uiSatdCost
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
+  int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
+  SWelsME* sMe4x4;
+  int32_t i4x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
+  int32_t iCostP4x4 = 0;
+  for (i4x4Idx = 0; i4x4Idx < 4; ++i4x4Idx) {
+    int32_t iPartIdx = (ki8x8Idx << 2) + i4x4Idx; //4x4 block index inside the MB: four per 8x8 block
+    iIdxX = ((ki8x8Idx & 1) << 1) + (i4x4Idx & 1); //column of the block, in 4x4 units
+    iIdxY = ((ki8x8Idx >> 1) << 1) + (i4x4Idx >> 1); //row of the block, in 4x4 units
+    iPixelX = (iIdxX << 2); //pixel offset of the block inside the MB
+    iPixelY = (iIdxY << 2);
+    iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc); //offset added to pEncMb[0]
+    iStrideRef = iPixelX + (iPixelY * iLineSizeRef); //offset added to pRefMb[0]
+
+    sMe4x4 = &pWelsMd->sMe.sMe4x4[ki8x8Idx][i4x4Idx];
+    InitMe (*pWelsMd, BLOCK_4x4,
+            pMbCache->SPicData.pEncMb[0] + iStrideEnc,
+            pMbCache->SPicData.pRefMb[0] + iStrideRef,
+            pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
+            *sMe4x4);
+    //the three assignments below are kept out of InitMe so that InitMe does not have to branch on the block mode
+    sMe4x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
+    sMe4x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
+    sMe4x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2; //NOTE(review): WelsMdP8x4 and WelsMdP4x8 use the same >> 2 despite the different block sizes - confirm intended
+
+    pSlice->sMvc[0] = sMe4x4->sMvBase; //single MV candidate for the search
+    pSlice->uiMvcNum = 1;
+
+    PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x4->sMvp)); //3rd argument presumably the block width in 4x4 units - confirm against PredMv
+    pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x4, pSlice);
+    UpdateP4x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x4->sMv)); //store the found MV in the cache that the next PredMv call reads
+    iCostP4x4 += sMe4x4->uiSatdCost;
+  }
+  return iCostP4x4;
+}
+
+int32_t WelsMdP8x4 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
+                    const int32_t ki8x8Idx) { //motion search on the two 8x4 blocks of 8x8 block ki8x8Idx; returns the sum of their uiSatdCost
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
+  int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
+  SWelsME* sMe8x4;
+  int32_t i8x4Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
+  int32_t iCostP8x4 = 0;
+  for (i8x4Idx = 0; i8x4Idx < 2; ++i8x4Idx) {
+    int32_t iPartIdx = (ki8x8Idx << 2) + (i8x4Idx << 1); //index of the first 4x4 block of this 8x4 block: +0 or +2 inside the 8x8 block
+    iIdxX = ((ki8x8Idx & 1) << 1); //column of the block, in 4x4 units
+    iIdxY = ((ki8x8Idx >> 1) << 1) + i8x4Idx; //row of the block, in 4x4 units
+    iPixelX = (iIdxX << 2); //pixel offset of the block inside the MB
+    iPixelY = (iIdxY << 2);
+    iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc); //offset added to pEncMb[0]
+    iStrideRef = iPixelX + (iPixelY * iLineSizeRef); //offset added to pRefMb[0]
+
+    sMe8x4 = &pWelsMd->sMe.sMe8x4[ki8x8Idx][i8x4Idx];
+    InitMe (*pWelsMd, BLOCK_8x4,
+            pMbCache->SPicData.pEncMb[0] + iStrideEnc,
+            pMbCache->SPicData.pRefMb[0] + iStrideRef,
+            pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
+            *sMe8x4);
+    //the three assignments below are kept out of InitMe so that InitMe does not have to branch on the block mode
+    sMe8x4->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
+    sMe8x4->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
+    sMe8x4->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2; //NOTE(review): WelsMdP4x4 and WelsMdP4x8 use the same >> 2 despite the different block sizes - confirm intended
+
+    pSlice->sMvc[0] = sMe8x4->sMvBase; //single MV candidate for the search
+    pSlice->uiMvcNum = 1;
+
+    PredMv (&pMbCache->sMvComponents, iPartIdx, 2, pWelsMd->uiRef, & (sMe8x4->sMvp)); //3rd argument presumably the block width in 4x4 units - confirm against PredMv
+    pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe8x4, pSlice);
+    UpdateP8x4Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe8x4->sMv)); //store the found MV in the cache that the next PredMv call reads
+    iCostP8x4 += sMe8x4->uiSatdCost;
+  }
+  return iCostP8x4;
+}
+
+int32_t WelsMdP4x8 (SWelsFuncPtrList* pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice* pSlice,
+                    const int32_t ki8x8Idx) {
+  //motion search on the two 4x8 blocks of 8x8 block ki8x8Idx; returns the sum of their uiSatdCost. TODO(Wayne): to be modified (original author's note)
+  SMbCache* pMbCache = &pSlice->sMbCacheInfo;
+  int32_t iLineSizeEnc = pCurDqLayer->iEncStride[0];
+  int32_t iLineSizeRef = pCurDqLayer->pRefPic->iLineSize[0];
+  SWelsME* sMe4x8;
+  int32_t i4x8Idx, iIdxX, iIdxY, iPixelX, iPixelY, iStrideEnc, iStrideRef;
+  int32_t iCostP4x8 = 0;
+  for (i4x8Idx = 0; i4x8Idx < 2; ++i4x8Idx) {
+    int32_t iPartIdx = (ki8x8Idx << 2) + i4x8Idx; //index of the first 4x4 block of this 4x8 block: +0 or +1 inside the 8x8 block
+    iIdxX = ((ki8x8Idx & 1) << 1) + i4x8Idx; //column of the block, in 4x4 units
+    iIdxY = ((ki8x8Idx >> 1) << 1); //row of the block, in 4x4 units
+    iPixelX = (iIdxX << 2); //pixel offset of the block inside the MB
+    iPixelY = (iIdxY << 2);
+    iStrideEnc = iPixelX + (iPixelY * iLineSizeEnc); //offset added to pEncMb[0]
+    iStrideRef = iPixelX + (iPixelY * iLineSizeRef); //offset added to pRefMb[0]
+
+    sMe4x8 = &pWelsMd->sMe.sMe4x8[ki8x8Idx][i4x8Idx];
+    InitMe (*pWelsMd, BLOCK_4x8,
+            pMbCache->SPicData.pEncMb[0] + iStrideEnc,
+            pMbCache->SPicData.pRefMb[0] + iStrideRef,
+            pCurDqLayer->pRefPic->pScreenBlockFeatureStorage,
+            *sMe4x8);
+    //the three assignments below are kept out of InitMe so that InitMe does not have to branch on the block mode
+    sMe4x8->iCurMeBlockPixX = pWelsMd->iMbPixX + iPixelX;
+    sMe4x8->iCurMeBlockPixY = pWelsMd->iMbPixY + iPixelY;
+    sMe4x8->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2; //NOTE(review): WelsMdP4x4 and WelsMdP8x4 use the same >> 2 despite the different block sizes - confirm intended
+
+    pSlice->sMvc[0] = sMe4x8->sMvBase; //single MV candidate for the search
+    pSlice->uiMvcNum = 1;
+
+    PredMv (&pMbCache->sMvComponents, iPartIdx, 1, pWelsMd->uiRef, & (sMe4x8->sMvp)); //3rd argument presumably the block width in 4x4 units - confirm against PredMv
+    pFunc->pfMotionSearch[0] (pFunc, pCurDqLayer, sMe4x8, pSlice);
+    UpdateP4x8Motion2Cache (pMbCache, iPartIdx, pWelsMd->uiRef, & (sMe4x8->sMv)); //store the found MV in the cache that the next PredMv call reads
+    iCostP4x8 += sMe4x8->uiSatdCost;
+  }
+  return iCostP4x8;
+}
+
 void WelsMdInterFinePartition (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
 //  SMbCache *pMbCache = &pSlice->sMbCacheInfo;
@@ -1129,6 +1247,7 @@
   if (iCost < iBestCost) {
     int32_t iCostPart;
     pCurMb->uiMbType = MB_TYPE_8x8;
+    pCurMb->uiSubMbType[0] = pCurMb->uiSubMbType[1] = pCurMb->uiSubMbType[2] = pCurMb->uiSubMbType[3] = SUB_MB_TYPE_8x8;
 
 //    WelsLog( pEncCtx, WELS_LOG_INFO, "WelsMdP16x8, p_ref[0]= 0x%p", pMbCache->SPicData.pRefMb[0]);
     iCostPart = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
@@ -1190,6 +1309,7 @@
     if (iCostP8x8 < iBestCost) {
       iBestCost = iCostP8x8;
       pCurMb->uiMbType = MB_TYPE_8x8;
+      pCurMb->uiSubMbType[0] = pCurMb->uiSubMbType[1] = pCurMb->uiSubMbType[2] = pCurMb->uiSubMbType[3] = SUB_MB_TYPE_8x8;
     }
     break;
 
@@ -1198,6 +1318,7 @@
     if (iCostP8x8 < iBestCost) {
       iBestCost = iCostP8x8;
       pCurMb->uiMbType = MB_TYPE_8x8;
+      pCurMb->uiSubMbType[0] = pCurMb->uiSubMbType[1] = pCurMb->uiSubMbType[2] = pCurMb->uiSubMbType[3] = SUB_MB_TYPE_8x8;
 
       iCostP16x8 = WelsMdP16x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice);
       if (iCostP16x8 <= iBestCost) {
@@ -1426,6 +1547,32 @@
 const int32_t g_kiPixStrideIdx8x8[4] = {  0,                                             ME_REFINE_BUF_WIDTH_BLK8,
                                           ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8
                                        };
+const int32_t g_kiPixStrideIdx4x4[4][4] = { //[8x8 block idx][4x4 sub-block idx] -> offset handed to InitMeRefinePointer for sub8x8 refinement
+  {
+    0,
+    0 + ME_REFINE_BUF_WIDTH_BLK4,
+    0 + ME_REFINE_BUF_STRIDE_BLK4,
+    0 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
+  }, //[0][]: base offset 0
+  {
+    ME_REFINE_BUF_WIDTH_BLK8,
+    ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
+    ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
+    ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
+  }, //[1][]: base offset WIDTH_BLK8
+  {
+    ME_REFINE_BUF_STRIDE_BLK8,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
+  }, //[2][]: base offset STRIDE_BLK8
+  {
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_STRIDE_BLK4,
+    ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8 + ME_REFINE_BUF_WIDTH_BLK4 + ME_REFINE_BUF_STRIDE_BLK4
+  } //[3][]: base offset STRIDE_BLK8 + WIDTH_BLK8
+};
 
 void WelsMdInterMbRefinement (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache* pMbCache) {
   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
@@ -1436,7 +1583,7 @@
   int32_t iBestSadCost = 0, iBestSatdCost = 0;
   SMeRefinePointer sMeRefine;
 
-  int32_t i, iIdx, iPixStride;
+  int32_t i, j, iIdx, iPixStride;
 
   uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
   uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2];
@@ -1536,40 +1683,141 @@
     }
     break;
   case MB_TYPE_8x8:
-    sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x8Aligned;
+    pMbCache->sMvComponents.iRefIndexCache [9] = pMbCache->sMvComponents.iRefIndexCache [21] = REF_NOT_AVAIL;
     for (i = 0; i < 4; i++) {
       int32_t iBlk8Idx = i << 2; //0, 4, 8, 12
-      int32_t iBlk4X, iBlk4Y;
+      int32_t iBlk4X, iBlk4Y, iBlk4x4Idx;
 
       pCurMb->pRefIndex[i] = pWelsMd->uiRef;
+      switch (pCurMb->uiSubMbType[i]) {
+      case SUB_MB_TYPE_8x8:
+        sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x8Aligned;
+        //luma
+        InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
+        PredMv (&pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp);
+        MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);
+        UpdateP8x8MotionInfo (pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
+        pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk8Idx]] = pWelsMd->sMe.sMe8x8[i].sMvp;
+        iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
+        iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost;
 
-      //luma
-      InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
-      PredMv (&pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp);
-      MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);
-      UpdateP8x8MotionInfo (pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
-      pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x8[i].sMvp;
-      iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
-      iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost;
+        //chroma
+        pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
+        iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
 
-      //chroma
-      pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
-      iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+        iBlk4X = (i & 1) << 2;
+        iBlk4Y = (i >> 1) << 2;
+        iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+        iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
 
-      iBlk4X = (i & 1) << 2;
-      iBlk4Y = (i >> 1) << 2;
-      iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
-      iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+        pTmpRefCb = pRefCb + iRefBlk4Stride;
+        pTmpDstCb = pDstCb + iDstBlk4Stride;
+        pTmpRefCr = pRefCr + iRefBlk4Stride;
+        pTmpDstCr = pDstCr + iDstBlk4Stride;
+        pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
+            4, 4); //Cb
+        pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
+            4, 4); //Cr
+        break;
+      case SUB_MB_TYPE_4x4:
+        sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x4;
+        //luma
+        for (j = 0; j < 4; ++j) {
+          iBlk4x4Idx = iBlk8Idx + j;
+          InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
+          PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMvp);
+          MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x4[i][j], &sMeRefine, 4, 4);
+          UpdateP4x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x4[i][j].sMv);
+          pMbCache->sMbMvp[g_kuiMbCountScan4Idx[iBlk4x4Idx]] = pWelsMd->sMe.sMe4x4[i][j].sMvp;
+          iBestSadCost += pWelsMd->sMe.sMe4x4[i][j].uiSadCost;
+          iBestSatdCost += pWelsMd->sMe.sMe4x4[i][j].uiSatdCost;
 
-      pTmpRefCb = pRefCb + iRefBlk4Stride;
-      pTmpDstCb = pDstCb + iDstBlk4Stride;
-      pTmpRefCr = pRefCr + iRefBlk4Stride;
-      pTmpDstCr = pDstCr + iDstBlk4Stride;
-      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
-          4, 4); //Cb
-      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
-          4, 4); //Cr
+          //chroma
+          pMv = &pWelsMd->sMe.sMe4x4[i][j].sMv;
+          iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
 
+          iBlk4X = (((i & 1) << 1) + (j & 1)) << 1;
+          iBlk4Y = (((i >> 1) << 1) + (j >> 1)) << 1;
+          iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+          iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+
+          pTmpRefCb = pRefCb + iRefBlk4Stride;
+          pTmpDstCb = pDstCb + iDstBlk4Stride;
+          pTmpRefCr = pRefCr + iRefBlk4Stride;
+          pTmpDstCr = pDstCr + iDstBlk4Stride;
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
+              2, 2); //Cb
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
+              2, 2); //Cr
+        }
+        break;
+      case SUB_MB_TYPE_8x4:
+        sMeRefine.pfCopyBlockByMode = pFunc->pfCopy8x4;
+        //luma
+        for (j = 0; j < 2; ++j) {
+          iBlk4x4Idx = iBlk8Idx + (j << 1);
+          InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j << 1]);
+          PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMvp);
+          MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe8x4[i][j], &sMeRefine, 8, 4);
+          UpdateP8x4MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x4[i][j].sMv);
+          pMbCache->sMbMvp[g_kuiMbCountScan4Idx[    iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
+          //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[1 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
+          iBestSadCost += pWelsMd->sMe.sMe8x4[i][j].uiSadCost;
+          iBestSatdCost += pWelsMd->sMe.sMe8x4[i][j].uiSatdCost;
+
+          //chroma
+          pMv = &pWelsMd->sMe.sMe8x4[i][j].sMv;
+          iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+
+          iBlk4X = ((i & 1) << 1) << 1;
+          iBlk4Y = (((i >> 1) << 1) + j) << 1;
+          iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+          iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+
+          pTmpRefCb = pRefCb + iRefBlk4Stride;
+          pTmpDstCb = pDstCb + iDstBlk4Stride;
+          pTmpRefCr = pRefCr + iRefBlk4Stride;
+          pTmpDstCr = pDstCr + iDstBlk4Stride;
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
+              4, 2); //Cb
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
+              4, 2); //Cr
+        }
+        break;
+      case SUB_MB_TYPE_4x8:
+        sMeRefine.pfCopyBlockByMode = pFunc->pfCopy4x8;
+        //luma
+        for (j = 0; j < 2; ++j) {
+          iBlk4x4Idx = iBlk8Idx + j;
+          InitMeRefinePointer (&sMeRefine, pMbCache, g_kiPixStrideIdx4x4[i][j]);
+          PredMv (&pMbCache->sMvComponents, iBlk4x4Idx, 1, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMvp);
+          MeRefineFracPixel (pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk4x4Idx], &pWelsMd->sMe.sMe4x8[i][j], &sMeRefine, 4, 8);
+          UpdateP4x8MotionInfo (pMbCache, pCurMb, iBlk4x4Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe4x8[i][j].sMv);
+          pMbCache->sMbMvp[g_kuiMbCountScan4Idx[    iBlk4x4Idx]] = pWelsMd->sMe.sMe4x8[i][j].sMvp;
+          //pMbCache->sMbMvp[g_kuiMbCountScan4Idx[4 + iBlk4x4Idx]] = pWelsMd->sMe.sMe8x4[i][j].sMvp;
+          iBestSadCost += pWelsMd->sMe.sMe4x8[i][j].uiSadCost;
+          iBestSatdCost += pWelsMd->sMe.sMe4x8[i][j].uiSatdCost;
+
+          //chroma
+          pMv = &pWelsMd->sMe.sMe4x8[i][j].sMv;
+          iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+
+          iBlk4X = (((i & 1) << 1) + j) << 1;
+          iBlk4Y = (((i >> 1) << 1)) << 1;
+          iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+          iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+
+          pTmpRefCb = pRefCb + iRefBlk4Stride;
+          pTmpDstCb = pDstCb + iDstBlk4Stride;
+          pTmpRefCr = pRefCr + iRefBlk4Stride;
+          pTmpDstCr = pDstCr + iDstBlk4Stride;
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY,
+              2, 4); //Cb
+          pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY,
+              2, 4); //Cr
+        }
+        break;
+      }
     }
     break;
   default:
--- a/codec/encoder/core/src/svc_mode_decision.cpp
+++ b/codec/encoder/core/src/svc_mode_decision.cpp
@@ -532,7 +532,8 @@
 
   return false;
 }
-bool WelsMdInterJudgeSCDPskipFalse (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* slice, SMB* pCurMb, SMbCache* pMbCache) {
+bool WelsMdInterJudgeSCDPskipFalse (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* slice, SMB* pCurMb,
+                                    SMbCache* pMbCache) {
   return false;
 }
 
@@ -606,7 +607,8 @@
 }
 
 
-void WelsMdInterFinePartitionVaaOnScreen (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb, int32_t iBestCost) {
+void WelsMdInterFinePartitionVaaOnScreen (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice* pSlice, SMB* pCurMb,
+    int32_t iBestCost) {
   SMbCache* pMbCache = &pSlice->sMbCacheInfo;
   SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
   int32_t iCostP8x8;
@@ -620,8 +622,37 @@
   if (iCostP8x8 < iBestCost) {
     iBestCost = iCostP8x8;
     pCurMb->uiMbType = MB_TYPE_8x8;
-
-    TryModeMerge (pMbCache, pWelsMd, pCurMb);
+    pCurMb->uiSubMbType[0] = pCurMb->uiSubMbType[1] = pCurMb->uiSubMbType[2] = pCurMb->uiSubMbType[3] = SUB_MB_TYPE_8x8;
+#if 0 //Disable for sub8x8 modes for now
+    iBestCost = 0;
+    //reset neighbor info for sub8x8
+    pMbCache->sMvComponents.iRefIndexCache [9] = pMbCache->sMvComponents.iRefIndexCache [21] = REF_NOT_AVAIL;
+    for (int32_t i8x8Idx = 0; i8x8Idx < 4; ++i8x8Idx) {
+      int32_t iCurCostSub8x8, iBestCostSub8x8 = pWelsMd->sMe.sMe8x8[i8x8Idx].uiSatdCost;
+      //4x4
+      iCurCostSub8x8 = WelsMdP4x4 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, i8x8Idx);
+      if (iCurCostSub8x8 < iBestCostSub8x8) {
+        pCurMb->uiSubMbType[i8x8Idx] = SUB_MB_TYPE_4x4;
+        iBestCostSub8x8 = iCurCostSub8x8;
+      }
+      //8x4
+      iCurCostSub8x8 = WelsMdP8x4 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, i8x8Idx);
+      if (iCurCostSub8x8 < iBestCostSub8x8) {
+        pCurMb->uiSubMbType[i8x8Idx] = SUB_MB_TYPE_8x4;
+        iBestCostSub8x8 = iCurCostSub8x8;
+      }
+      //4x8
+      iCurCostSub8x8 = WelsMdP4x8 (pEncCtx->pFuncList, pCurDqLayer, pWelsMd, pSlice, i8x8Idx);
+      if (iCurCostSub8x8 < iBestCostSub8x8) {
+        pCurMb->uiSubMbType[i8x8Idx] = SUB_MB_TYPE_4x8;
+        iBestCostSub8x8 = iCurCostSub8x8;
+      }
+      iBestCost += iBestCostSub8x8;
+    }
+    if ((pCurMb->uiSubMbType[0] == SUB_MB_TYPE_8x8) && (pCurMb->uiSubMbType[1] == SUB_MB_TYPE_8x8)
+        && (pCurMb->uiSubMbType[2] == SUB_MB_TYPE_8x8) && (pCurMb->uiSubMbType[3] == SUB_MB_TYPE_8x8)) //all 8x8
+#endif
+      TryModeMerge (pMbCache, pWelsMd, pCurMb);
   }
   pWelsMd->iCostLuma = iBestCost;
 }
--- a/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
+++ b/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
@@ -271,8 +271,7 @@
   WelsCabacEncodeDecision (pCabacCtx, iCtx, bSkipFlag);
 
   if (bSkipFlag) {
-    for (int  i = 0; i < 4; i++) {
-
+    for (int  i = 0; i < 16; i++) {
       pCurMb->sMvd[i].iMvX = 0;
       pCurMb->sMvd[i].iMvY = 0;
     }
@@ -338,7 +337,7 @@
   }
 }
 SMVUnitXY WelsCabacMbMvd (SCabacCtx* pCabacCtx, SMB* pCurMb, uint32_t iMbWidth,
-                          SMVUnitXY sCurMv, SMVUnitXY sPredMv, int16_t iBlockIdx) {
+                          SMVUnitXY sCurMv, SMVUnitXY sPredMv, int16_t i4x4ScanIdx) {
   uint32_t iAbsMvd0, iAbsMvd1;
   uint8_t uiNeighborAvail = pCurMb->uiNeighborAvail;
   SMVUnitXY sMvd;
@@ -347,19 +346,16 @@
 
   sMvdLeft.iMvX = sMvdLeft.iMvY = sMvdTop.iMvX = sMvdTop.iMvY = 0;
   sMvd.sDeltaMv (sCurMv, sPredMv);
-
-  if (((iBlockIdx == 0) || (iBlockIdx == 1)) && (uiNeighborAvail & TOP_MB_POS)) {
-    sMvdTop.sAssginMv ((pCurMb - iMbWidth)->sMvd[iBlockIdx + 2]);
+  if ((i4x4ScanIdx < 4) && (uiNeighborAvail & TOP_MB_POS)) { //top row blocks
+    sMvdTop.sAssginMv ((pCurMb - iMbWidth)->sMvd[i4x4ScanIdx + 12]);
+  } else if (i4x4ScanIdx >= 4) {
+    sMvdTop.sAssginMv (pCurMb->sMvd[i4x4ScanIdx - 4]);
   }
-  if ((iBlockIdx == 2) || (iBlockIdx == 3)) {
-    sMvdTop.sAssginMv (pCurMb->sMvd[iBlockIdx - 2]);
+  if ((! (i4x4ScanIdx & 0x03)) && (uiNeighborAvail & LEFT_MB_POS)) { //left column blocks
+    sMvdLeft.sAssginMv ((pCurMb - 1)->sMvd[i4x4ScanIdx + 3]);
+  } else if (i4x4ScanIdx & 0x03) {
+    sMvdLeft.sAssginMv (pCurMb->sMvd[i4x4ScanIdx - 1]);
   }
-  if (((iBlockIdx == 0) || (iBlockIdx == 2)) && (uiNeighborAvail & LEFT_MB_POS)) {
-    sMvdLeft.sAssginMv ((pCurMb - 1)->sMvd[iBlockIdx + 1]);
-  }
-  if ((iBlockIdx == 1) || (iBlockIdx == 3)) {
-    sMvdLeft.sAssginMv (pCurMb->sMvd[iBlockIdx - 1]);
-  }
 
   iAbsMvd0 = WELS_ABS (sMvdLeft.iMvX) + WELS_ABS (sMvdTop.iMvX);
   iAbsMvd1 = WELS_ABS (sMvdLeft.iMvY) + WELS_ABS (sMvdTop.iMvY);
@@ -368,7 +364,63 @@
   WelsCabacMbMvdLx (pCabacCtx, sMvd.iMvY, 47, iAbsMvd1);
   return sMvd;
 }
+// Codes sub_mb_type of the four 8x8 blocks of pCurMb with CABAC contexts 21..23.
+// Bins written per block: 8x8 -> 1; 8x4 -> 0,0; 4x8 -> 0,1,1; otherwise -> 0,1,0.
+static void WelsCabacSubMbType (SCabacCtx* pCabacCtx, SMB* pCurMb) {
+  for (int32_t iBlk8 = 0; iBlk8 < 4; ++iBlk8) {
+    const uint32_t kuiType = pCurMb->uiSubMbType[iBlk8];
+    const bool kbIs8x8 = (kuiType == SUB_MB_TYPE_8x8);
+    WelsCabacEncodeDecision (pCabacCtx, 21, kbIs8x8 ? 1 : 0);
+    if (!kbIs8x8) {
+      const bool kbIs8x4 = (kuiType == SUB_MB_TYPE_8x4);
+      WelsCabacEncodeDecision (pCabacCtx, 22, kbIs8x4 ? 0 : 1);
+      if (!kbIs8x4) {
+        WelsCabacEncodeDecision (pCabacCtx, 23, kuiType == SUB_MB_TYPE_4x8);
+      }
+    }
+  }
+}
 
+// Writes the mvd of every sub-partition of an MB split into four 8x8 blocks.
+// Each 8x8 block is split as given by pCurMb->uiSubMbType[]; every sub-partition is
+// coded by WelsCabacMbMvd() and the returned mvd is copied to all sMvd[] entries
+// (4 per row) it covers. A block with none of the four known sub types is skipped.
+static void WelsCabacSubMbMvd (SCabacCtx* pCabacCtx, SMB* pCurMb, SMbCache* pMbCache, const int kiMbWidth) {
+  SMVUnitXY sPartMvd;
+  for (int32_t iBlk8 = 0; iBlk8 < 4; ++iBlk8) {
+    int32_t iPartNum;  // sub-partitions inside this 8x8 block
+    int32_t iScanStep; // distance between their entries in g_kuiMbCountScan4Idx
+    int32_t iCols;     // sMvd[] entries covered horizontally by one sub-partition
+    int32_t iRows;     // sMvd[] entries covered vertically by one sub-partition
+    switch (pCurMb->uiSubMbType[iBlk8]) {
+    case SUB_MB_TYPE_8x8:
+      iPartNum = 1; iScanStep = 0; iCols = 2; iRows = 2;
+      break;
+    case SUB_MB_TYPE_8x4:
+      iPartNum = 2; iScanStep = 2; iCols = 2; iRows = 1;
+      break;
+    case SUB_MB_TYPE_4x8:
+      iPartNum = 2; iScanStep = 1; iCols = 1; iRows = 2;
+      break;
+    case SUB_MB_TYPE_4x4:
+      iPartNum = 4; iScanStep = 1; iCols = 1; iRows = 1;
+      break;
+    default:
+      continue;
+    }
+    for (int32_t iPart = 0; iPart < iPartNum; ++iPart) {
+      const int32_t kiIdx = g_kuiMbCountScan4Idx[ (iBlk8 << 2) + iPart * iScanStep];
+      sPartMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, kiMbWidth, pCurMb->sMv[kiIdx],
+                                 pMbCache->sMbMvp[kiIdx], kiIdx);
+      for (int32_t iRow = 0; iRow < iRows; ++iRow) {
+        for (int32_t iCol = 0; iCol < iCols; ++iCol) {
+          pCurMb->sMvd[kiIdx + 4 * iRow + iCol].sAssginMv (sPartMvd);
+        }
+      }
+    }
+  }
+}
+
 int16_t WelsGetMbCtxCabac (SMbCache* pMbCache, SMB* pCurMb, uint32_t iMbWidth, ECtxBlockCat eCtxBlockCat,
                            int16_t iIdx) {
   int16_t iNzA = -1, iNzB = -1;
@@ -610,10 +662,9 @@
       }
       WelsCabacMbIntraChromaPredMode (pCabacCtx, pCurMb, pMbCache, iMbWidth);
       sMvd.iMvX = sMvd.iMvY = 0;
-      pCurMb->sMvd[0].sAssginMv (sMvd);
-      pCurMb->sMvd[1].sAssginMv (sMvd);
-      pCurMb->sMvd[2].sAssginMv (sMvd);
-      pCurMb->sMvd[3].sAssginMv (sMvd);
+      for (i = 0; i < 16; ++i) {
+        pCurMb->sMvd[i].sAssginMv (sMvd);
+      }
 
     } else if (uiMbType == MB_TYPE_16x16) {
 
@@ -622,10 +673,9 @@
       }
       sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[0], pMbCache->sMbMvp[0], 0);
 
-      pCurMb->sMvd[0].sAssginMv (sMvd);
-      pCurMb->sMvd[1].sAssginMv (sMvd);
-      pCurMb->sMvd[2].sAssginMv (sMvd);
-      pCurMb->sMvd[3].sAssginMv (sMvd);
+      for (i = 0; i < 16; ++i) {
+        pCurMb->sMvd[i].sAssginMv (sMvd);
+      }
 
     } else if (uiMbType == MB_TYPE_16x8) {
       if (uiNumRefIdxL0Active > 0) {
@@ -633,14 +683,13 @@
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 12);
       }
       sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth , pCurMb->sMv[0], pMbCache->sMbMvp[0], 0);
-      pCurMb->sMvd[0].sAssginMv (sMvd);
-      pCurMb->sMvd[1].sAssginMv (sMvd);
-
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[8], pMbCache->sMbMvp[1], 2);
-      pCurMb->sMvd[2].sAssginMv (sMvd);
-      pCurMb->sMvd[3].sAssginMv (sMvd);
-
+      for (i = 0; i < 8; ++i) {
+        pCurMb->sMvd[i].sAssginMv (sMvd);
+      }
+      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[8], pMbCache->sMbMvp[1], 8);
+      for (i = 8; i < 16; ++i) {
+        pCurMb->sMvd[i].sAssginMv (sMvd);
+      }
     } else  if (uiMbType == MB_TYPE_8x16) {
       if (uiNumRefIdxL0Active > 0) {
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 0);
@@ -647,16 +696,18 @@
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 2);
       }
       sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[0], pMbCache->sMbMvp[0], 0);
-      pCurMb->sMvd[0].sAssginMv (sMvd);
-      pCurMb->sMvd[2].sAssginMv (sMvd);
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth,  pCurMb->sMv[2], pMbCache->sMbMvp[1], 1);
-      pCurMb->sMvd[1].sAssginMv (sMvd);
-      pCurMb->sMvd[3].sAssginMv (sMvd);
-
+      for (i = 0; i < 16; i += 4) {
+        pCurMb->sMvd[i    ].sAssginMv (sMvd);
+        pCurMb->sMvd[i + 1].sAssginMv (sMvd);
+      }
+      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth,  pCurMb->sMv[2], pMbCache->sMbMvp[1], 2);
+      for (i = 0; i < 16; i += 4) {
+        pCurMb->sMvd[i + 2].sAssginMv (sMvd);
+        pCurMb->sMvd[i + 3].sAssginMv (sMvd);
+      }
     } else if ((uiMbType == MB_TYPE_8x8) || (uiMbType == MB_TYPE_8x8_REF0)) {
-      for (i = 0; i < 4; i++)
-        WelsCabacEncodeDecision (pCabacCtx, 21, 1);
+      //write sub_mb_type
+      WelsCabacSubMbType (pCabacCtx, pCurMb);
 
       if (uiNumRefIdxL0Active > 0) {
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 0);
@@ -664,19 +715,8 @@
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 12);
         WelsCabacMbRef (pCabacCtx, pCurMb, pMbCache, 14);
       }
-
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[0], pMbCache->sMbMvp[0], 0);
-      pCurMb->sMvd[0].sAssginMv (sMvd);
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[2], pMbCache->sMbMvp[1], 1);
-      pCurMb->sMvd[1].sAssginMv (sMvd);
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[8], pMbCache->sMbMvp[2], 2);
-      pCurMb->sMvd[2].sAssginMv (sMvd);
-
-      sMvd = WelsCabacMbMvd (pCabacCtx, pCurMb, iMbWidth, pCurMb->sMv[10], pMbCache->sMbMvp[3], 3);
-      pCurMb->sMvd[3].sAssginMv (sMvd);
+      //write sub8x8 mvd
+      WelsCabacSubMbMvd (pCabacCtx, pCurMb, pMbCache, iMbWidth);
     }
     if (uiMbType != MB_TYPE_INTRA16x16) {
       WelsCabacMbCbp (pCurMb, iMbWidth, pCabacCtx);
--- a/codec/encoder/core/src/svc_set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/svc_set_mb_syn_cavlc.cpp
@@ -190,7 +190,22 @@
 
   //step 1: sub_mb_type
   for (i = 0; i < 4; i++) {
-    BsWriteUE (pBs, 0);
+    switch (pCurMb->uiSubMbType[i]) {
+    case SUB_MB_TYPE_8x8:
+      BsWriteUE (pBs, 0);
+      break;
+    case SUB_MB_TYPE_8x4:
+      BsWriteUE (pBs, 1);
+      break;
+    case SUB_MB_TYPE_4x8:
+      BsWriteUE (pBs, 2);
+      break;
+    case SUB_MB_TYPE_4x4:
+      BsWriteUE (pBs, 3);
+      break;
+    default: //should not enter
+      break;
+    }
   }
 
   //step 2: get and write uiRefIndex and sMvd
@@ -202,8 +217,30 @@
   }
   //write sMvd
   for (i = 0; i < 4; i++) {
-    BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[i].iMvX);
-    BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[i].iMvY);
+    uint32_t uiSubMbType = pCurMb->uiSubMbType[i];
+    if (SUB_MB_TYPE_8x8 == uiSubMbType) {
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[*kpScan4].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[*kpScan4].iMvY);
+    } else if (SUB_MB_TYPE_4x4 == uiSubMbType) {
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[*kpScan4].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[*kpScan4].iMvY);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 1)].iMvX - pMbCache->sMbMvp[* (kpScan4 + 1)].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 1)].iMvY - pMbCache->sMbMvp[* (kpScan4 + 1)].iMvY);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 2)].iMvX - pMbCache->sMbMvp[* (kpScan4 + 2)].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 2)].iMvY - pMbCache->sMbMvp[* (kpScan4 + 2)].iMvY);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 3)].iMvX - pMbCache->sMbMvp[* (kpScan4 + 3)].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 3)].iMvY - pMbCache->sMbMvp[* (kpScan4 + 3)].iMvY);
+    } else if (SUB_MB_TYPE_8x4 == uiSubMbType) {
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[*kpScan4].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[*kpScan4].iMvY);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 2)].iMvX - pMbCache->sMbMvp[* (kpScan4 + 2)].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 2)].iMvY - pMbCache->sMbMvp[* (kpScan4 + 2)].iMvY);
+    } else if (SUB_MB_TYPE_4x8 == uiSubMbType) {
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvX - pMbCache->sMbMvp[*kpScan4].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[*kpScan4].iMvY - pMbCache->sMbMvp[*kpScan4].iMvY);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 1)].iMvX - pMbCache->sMbMvp[* (kpScan4 + 1)].iMvX);
+      BsWriteSE (pBs, pCurMb->sMv[* (kpScan4 + 1)].iMvY - pMbCache->sMbMvp[* (kpScan4 + 1)].iMvY);
+    }
     kpScan4 += 4;
   }
 }