shithub: openh264

ref: 102dc5f0f046a8bf144f491759a7a93826ded4f4
dir: /codec/encoder/core/src/md.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file	md.cpp
 *
 * \brief	mode decision
 *
 * \date	2009.05.14 Created
 *
 *************************************************************************************
 */

#include "ls_defines.h"
#include "md.h"
#include "cpu_core.h"
#include "svc_enc_golomb.h"

namespace WelsSVCEnc {
#define INTRA_VARIANCE_SAD_THRESHOLD 150
#define INTER_VARIANCE_SAD_THRESHOLD 20

/*!
 * \brief   Load the intra-related neighbour data of the current MB into the MB cache:
 *          non-zero coefficient counts, intra 4x4 prediction modes and the neighbour
 *          availability mask (pMbCache->uiNeighborIntra).
 *
 * \param   pMbCache    cache to fill
 * \param   pCurMb      current macroblock; the left MB is pCurMb - 1
 * \param   iMbWidth    distance (in SMB entries) between pCurMb and the MB above it
 *
 * \note    Slots belonging to a missing neighbour are set to -1 (unavailable). Prediction
 *          modes of an available neighbour that is not intra 4x4 are set to 2 (DC).
 */
void FillNeighborCacheIntra (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth) {
  // cache slot <- entry of the left MB's non-zero-count array (same position in both tables)
  static const uint8_t kuiLeftNzcDst[8] = { 8, 16, 24, 32, 13, 21, 37, 45 };
  static const uint8_t kuiLeftNzcSrc[8] = { 3,  7, 11, 15, 17, 21, 19, 23 };
  // cache slot <- entry of the left MB's intra 4x4 prediction mode array
  static const uint8_t kuiLeftModeDst[4] = { 8, 16, 24, 32 };
  static const uint8_t kuiLeftModeSrc[4] = { 4,  5,  6,  3 };

  const uint32_t kuiAvail = pCurMb->uiNeighborAvail;
  uint32_t uiIntraMask = 0;
  int32_t iIdx;

  //---------------------------- left MB ----------------------------//
  if (kuiAvail & LEFT_MB_POS) {
    // the left MB's counts sit one MB worth of entries before the current MB's
    const int8_t* kpLeftNzc = pCurMb->pNonZeroCount - MB_LUMA_CHROMA_BLOCK4x4_NUM;

    for (iIdx = 0; iIdx < 8; ++iIdx) {
      pMbCache->iNonZeroCoeffCount[kuiLeftNzcDst[iIdx]] = kpLeftNzc[kuiLeftNzcSrc[iIdx]];
    }

    uiIntraMask |= LEFT_MB_POS;

    if (IS_INTRA4x4 ((pCurMb - 1)->uiMbType)) {
      const int8_t* kpLeftModes = pCurMb->pIntra4x4PredMode - INTRA_4x4_MODE_NUM;

      for (iIdx = 0; iIdx < 4; ++iIdx) {
        pMbCache->iIntraPredMode[kuiLeftModeDst[iIdx]] = kpLeftModes[kuiLeftModeSrc[iIdx]];
      }
    } else { // 0 == constrained_intra_pred_flag || IS_INTRA16x16((pCurMb-1)->uiMbType)
      for (iIdx = 0; iIdx < 4; ++iIdx) {
        pMbCache->iIntraPredMode[kuiLeftModeDst[iIdx]] = 2; // DC
      }
    }
  } else {
    for (iIdx = 0; iIdx < 8; ++iIdx) {
      pMbCache->iNonZeroCoeffCount[kuiLeftNzcDst[iIdx]] = -1; // unavailable
    }
    for (iIdx = 0; iIdx < 4; ++iIdx) {
      pMbCache->iIntraPredMode[kuiLeftModeDst[iIdx]] = -1;    // unavailable
    }
  }

  //---------------------------- top MB -----------------------------//
  if (kuiAvail & TOP_MB_POS) {
    SMB* pMbAbove = pCurMb - iMbWidth;

    // four counts starting at entry 12, then two pairs starting at entries 20 and 22
    ST32 (&pMbCache->iNonZeroCoeffCount[1], LD32 (&pMbAbove->pNonZeroCount[12]));
    ST16 (&pMbCache->iNonZeroCoeffCount[6], LD16 (&pMbAbove->pNonZeroCount[20]));
    ST16 (&pMbCache->iNonZeroCoeffCount[30], LD16 (&pMbAbove->pNonZeroCount[22]));

    uiIntraMask |= TOP_MB_POS;

    if (IS_INTRA4x4 (pMbAbove->uiMbType)) {
      ST32 (pMbCache->iIntraPredMode + 1, LD32 (&pMbAbove->pIntra4x4PredMode[0]));
    } else { // 0 == constrained_intra_pred_flag || IS_INTRA16x16(pMbAbove->uiMbType)
      const uint32_t kuiFourDcModes = 0x02020202;
      ST32 (pMbCache->iIntraPredMode + 1, kuiFourDcModes);
    }
  } else {
    const uint32_t kuiFourUnavail = 0xffffffff; // four bytes of -1

    ST32 (pMbCache->iIntraPredMode + 1, kuiFourUnavail);
    ST32 (&pMbCache->iNonZeroCoeffCount[1], kuiFourUnavail);
    ST16 (&pMbCache->iNonZeroCoeffCount[6], 0xffff);
    ST16 (&pMbCache->iNonZeroCoeffCount[30], 0xffff);
  }

  //----------------------- diagonal neighbours ---------------------//
  if (kuiAvail & TOPLEFT_MB_POS) {
    uiIntraMask |= 0x04;
  }
  if (kuiAvail & TOPRIGHT_MB_POS) {
    uiIntraMask |= 0x08;
  }

  pMbCache->uiNeighborIntra = uiIntraMask;
}
/*!
 * \brief   Fill the inter neighbour cache of the current MB: motion vectors, reference
 *          indices, SAD costs and skip information of the left, top, top-left and
 *          top-right macroblocks. This variant does not look at pVaaBgMbFlag.
 *
 * \param   pMbCache        cache to fill
 * \param   pCurMb          current macroblock
 * \param   iMbWidth        distance (in entries) to the row above, for both the SMB array
 *                          and pMbCache->pEncSad
 * \param   pVaaBgMbFlag    unused here; kept so the signature matches
 *                          FillNeighborCacheInterWithBGD
 *
 * \note    A neighbour that is missing or not inter coded gets zero motion vectors, zero
 *          SAD/skip data and a reference index of REF_NOT_AVAIL (missing) or
 *          REF_NOT_IN_LIST (present but not inter).
 */
void FillNeighborCacheInterWithoutBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag) {
  // cache slots that are always flagged as not available (right-top 4x4 blocks)
  static const uint8_t kuiNeverAvail[5] = { 9, 11, 17, 21, 23 };

  const uint32_t kuiAvail   = pCurMb->uiNeighborAvail;
  SMB* pMbLeft              = pCurMb - 1;
  SMB* pMbTop               = pCurMb - iMbWidth;
  SMB* pMbTopLeft           = pMbTop - 1;
  SMB* pMbTopRight          = pMbTop + 1;
  SMVComponentUnit* pMvUnit = &pMbCache->sMvComponents;
  int32_t iSlot;

  //---------------------------- left MB ----------------------------//
  if ((kuiAvail & LEFT_MB_POS) && IS_SVC_INTER (pMbLeft->uiMbType)) {
    const bool kbLeftSkip = (MB_TYPE_SKIP == pMbLeft->uiMbType);

    pMvUnit->sMotionVectorCache[ 6] = pMbLeft->sMv[ 3];
    pMvUnit->sMotionVectorCache[12] = pMbLeft->sMv[ 7];
    pMvUnit->sMotionVectorCache[18] = pMbLeft->sMv[11];
    pMvUnit->sMotionVectorCache[24] = pMbLeft->sMv[15];
    pMvUnit->iRefIndexCache[ 6] = pMvUnit->iRefIndexCache[12] = pMbLeft->pRefIndex[1];
    pMvUnit->iRefIndexCache[18] = pMvUnit->iRefIndexCache[24] = pMbLeft->pRefIndex[3];

    pMbCache->iSadCost[3]     = pMbLeft->pSadCost[0];
    pMbCache->bMbTypeSkip[3]  = kbLeftSkip;
    pMbCache->iSadCostSkip[3] = kbLeftSkip ? pMbCache->pEncSad[-1] : 0;
  } else { // unavailable or not inter
    const int8_t kiLeftRef = (kuiAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    ST32 (&pMvUnit->sMotionVectorCache[ 6], 0);
    ST32 (&pMvUnit->sMotionVectorCache[12], 0);
    ST32 (&pMvUnit->sMotionVectorCache[18], 0);
    ST32 (&pMvUnit->sMotionVectorCache[24], 0);
    pMvUnit->iRefIndexCache[ 6] = kiLeftRef;
    pMvUnit->iRefIndexCache[12] = kiLeftRef;
    pMvUnit->iRefIndexCache[18] = kiLeftRef;
    pMvUnit->iRefIndexCache[24] = kiLeftRef;

    pMbCache->iSadCost[3]     = 0;
    pMbCache->bMbTypeSkip[3]  = 0;
    pMbCache->iSadCostSkip[3] = 0;
  }

  //---------------------------- top MB -----------------------------//
  if ((kuiAvail & TOP_MB_POS) && IS_SVC_INTER (pMbTop->uiMbType)) {
    const bool kbTopSkip = (MB_TYPE_SKIP == pMbTop->uiMbType);

    // each 64-bit copy moves two neighbouring motion vectors at once
    ST64 (&pMvUnit->sMotionVectorCache[1], LD64 (&pMbTop->sMv[12]));
    ST64 (&pMvUnit->sMotionVectorCache[3], LD64 (&pMbTop->sMv[14]));
    pMvUnit->iRefIndexCache[1] = pMvUnit->iRefIndexCache[2] = pMbTop->pRefIndex[2];
    pMvUnit->iRefIndexCache[3] = pMvUnit->iRefIndexCache[4] = pMbTop->pRefIndex[3];

    pMbCache->iSadCost[1]     = pMbTop->pSadCost[0];
    pMbCache->bMbTypeSkip[1]  = kbTopSkip;
    pMbCache->iSadCostSkip[1] = kbTopSkip ? pMbCache->pEncSad[-iMbWidth] : 0;
  } else { // unavailable or not inter
    const int8_t kiTopRef = (kuiAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    ST64 (&pMvUnit->sMotionVectorCache[1], 0);
    ST64 (&pMvUnit->sMotionVectorCache[3], 0);
    pMvUnit->iRefIndexCache[1] = kiTopRef;
    pMvUnit->iRefIndexCache[2] = kiTopRef;
    pMvUnit->iRefIndexCache[3] = kiTopRef;
    pMvUnit->iRefIndexCache[4] = kiTopRef;

    pMbCache->iSadCost[1]     = 0;
    pMbCache->bMbTypeSkip[1]  = 0;
    pMbCache->iSadCostSkip[1] = 0;
  }

  //-------------------------- top-left MB --------------------------//
  if ((kuiAvail & TOPLEFT_MB_POS) && IS_SVC_INTER (pMbTopLeft->uiMbType)) {
    const bool kbTopLeftSkip = (MB_TYPE_SKIP == pMbTopLeft->uiMbType);

    pMvUnit->sMotionVectorCache[0] = pMbTopLeft->sMv[15];
    pMvUnit->iRefIndexCache[0]     = pMbTopLeft->pRefIndex[3];

    pMbCache->iSadCost[0]     = pMbTopLeft->pSadCost[0];
    pMbCache->bMbTypeSkip[0]  = kbTopLeftSkip;
    pMbCache->iSadCostSkip[0] = kbTopLeftSkip ? pMbCache->pEncSad[-iMbWidth - 1] : 0;
  } else { // unavailable or not inter
    ST32 (&pMvUnit->sMotionVectorCache[0], 0);
    pMvUnit->iRefIndexCache[0] = (kuiAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    pMbCache->iSadCost[0]     = 0;
    pMbCache->bMbTypeSkip[0]  = 0;
    pMbCache->iSadCostSkip[0] = 0;
  }

  //-------------------------- top-right MB -------------------------//
  if ((kuiAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER (pMbTopRight->uiMbType)) {
    const bool kbTopRightSkip = (MB_TYPE_SKIP == pMbTopRight->uiMbType);

    pMvUnit->sMotionVectorCache[5] = pMbTopRight->sMv[12];
    pMvUnit->iRefIndexCache[5]     = pMbTopRight->pRefIndex[2];

    pMbCache->iSadCost[2]     = pMbTopRight->pSadCost[0];
    pMbCache->bMbTypeSkip[2]  = kbTopRightSkip;
    pMbCache->iSadCostSkip[2] = kbTopRightSkip ? pMbCache->pEncSad[-iMbWidth + 1] : 0;
  } else { // unavailable or not inter
    ST32 (&pMvUnit->sMotionVectorCache[5], 0);
    pMvUnit->iRefIndexCache[5] = (kuiAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    pMbCache->iSadCost[2]     = 0;
    pMbCache->bMbTypeSkip[2]  = 0;
    pMbCache->iSadCostSkip[2] = 0;
  }

  //------------- right-top 4x4 blocks: never available -------------//
  for (iSlot = 0; iSlot < 5; ++iSlot) {
    ST32 (&pMvUnit->sMotionVectorCache[kuiNeverAvail[iSlot]], 0);
    pMvUnit->iRefIndexCache[kuiNeverAvail[iSlot]] = REF_NOT_AVAIL;
  }
}

/*!
 * \brief   Fill the inter neighbour cache of the current MB: motion vectors, reference
 *          indices, SAD costs and skip information of the left, top, top-left and
 *          top-right macroblocks. In this variant a neighbour only counts as skipped
 *          when its type is MB_TYPE_SKIP and its entry in pVaaBgMbFlag is 0.
 *
 * \param   pMbCache        cache to fill
 * \param   pCurMb          current macroblock
 * \param   iMbWidth        distance (in entries) to the row above, for the SMB array,
 *                          pMbCache->pEncSad and pVaaBgMbFlag
 * \param   pVaaBgMbFlag    per-MB flag array, addressed relative to the current MB
 *
 * \note    A neighbour that is missing or not inter coded gets zero motion vectors, zero
 *          SAD/skip data and a reference index of REF_NOT_AVAIL (missing) or
 *          REF_NOT_IN_LIST (present but not inter).
 */
void FillNeighborCacheInterWithBGD (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t* pVaaBgMbFlag) {
  // cache slots that are always flagged as not available (right-top 4x4 blocks)
  static const uint8_t kuiNeverAvail[5] = { 9, 11, 17, 21, 23 };

  const uint32_t kuiAvail   = pCurMb->uiNeighborAvail;
  SMB* pMbLeft              = pCurMb - 1;
  SMB* pMbTop               = pCurMb - iMbWidth;
  SMB* pMbTopLeft           = pMbTop - 1;
  SMB* pMbTopRight          = pMbTop + 1;
  SMVComponentUnit* pMvUnit = &pMbCache->sMvComponents;
  int32_t iSlot;

  //---------------------------- left MB ----------------------------//
  if ((kuiAvail & LEFT_MB_POS) && IS_SVC_INTER (pMbLeft->uiMbType)) {
    // the flag is only read when the MB type already says "skip"
    const bool kbLeftSkip = (MB_TYPE_SKIP == pMbLeft->uiMbType) && (0 == pVaaBgMbFlag[-1]);

    pMvUnit->sMotionVectorCache[ 6] = pMbLeft->sMv[ 3];
    pMvUnit->sMotionVectorCache[12] = pMbLeft->sMv[ 7];
    pMvUnit->sMotionVectorCache[18] = pMbLeft->sMv[11];
    pMvUnit->sMotionVectorCache[24] = pMbLeft->sMv[15];
    pMvUnit->iRefIndexCache[ 6] = pMvUnit->iRefIndexCache[12] = pMbLeft->pRefIndex[1];
    pMvUnit->iRefIndexCache[18] = pMvUnit->iRefIndexCache[24] = pMbLeft->pRefIndex[3];

    pMbCache->iSadCost[3]     = pMbLeft->pSadCost[0];
    pMbCache->bMbTypeSkip[3]  = kbLeftSkip;
    pMbCache->iSadCostSkip[3] = kbLeftSkip ? pMbCache->pEncSad[-1] : 0;
  } else { // unavailable or not inter
    const int8_t kiLeftRef = (kuiAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    ST32 (&pMvUnit->sMotionVectorCache[ 6], 0);
    ST32 (&pMvUnit->sMotionVectorCache[12], 0);
    ST32 (&pMvUnit->sMotionVectorCache[18], 0);
    ST32 (&pMvUnit->sMotionVectorCache[24], 0);
    pMvUnit->iRefIndexCache[ 6] = kiLeftRef;
    pMvUnit->iRefIndexCache[12] = kiLeftRef;
    pMvUnit->iRefIndexCache[18] = kiLeftRef;
    pMvUnit->iRefIndexCache[24] = kiLeftRef;

    pMbCache->iSadCost[3]     = 0;
    pMbCache->bMbTypeSkip[3]  = 0;
    pMbCache->iSadCostSkip[3] = 0;
  }

  //---------------------------- top MB -----------------------------//
  if ((kuiAvail & TOP_MB_POS) && IS_SVC_INTER (pMbTop->uiMbType)) {
    const bool kbTopSkip = (MB_TYPE_SKIP == pMbTop->uiMbType) && (0 == pVaaBgMbFlag[-iMbWidth]);

    // each 64-bit copy moves two neighbouring motion vectors at once
    ST64 (&pMvUnit->sMotionVectorCache[1], LD64 (&pMbTop->sMv[12]));
    ST64 (&pMvUnit->sMotionVectorCache[3], LD64 (&pMbTop->sMv[14]));
    pMvUnit->iRefIndexCache[1] = pMvUnit->iRefIndexCache[2] = pMbTop->pRefIndex[2];
    pMvUnit->iRefIndexCache[3] = pMvUnit->iRefIndexCache[4] = pMbTop->pRefIndex[3];

    pMbCache->iSadCost[1]     = pMbTop->pSadCost[0];
    pMbCache->bMbTypeSkip[1]  = kbTopSkip;
    pMbCache->iSadCostSkip[1] = kbTopSkip ? pMbCache->pEncSad[-iMbWidth] : 0;
  } else { // unavailable or not inter
    const int8_t kiTopRef = (kuiAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    ST64 (&pMvUnit->sMotionVectorCache[1], 0);
    ST64 (&pMvUnit->sMotionVectorCache[3], 0);
    pMvUnit->iRefIndexCache[1] = kiTopRef;
    pMvUnit->iRefIndexCache[2] = kiTopRef;
    pMvUnit->iRefIndexCache[3] = kiTopRef;
    pMvUnit->iRefIndexCache[4] = kiTopRef;

    pMbCache->iSadCost[1]     = 0;
    pMbCache->bMbTypeSkip[1]  = 0;
    pMbCache->iSadCostSkip[1] = 0;
  }

  //-------------------------- top-left MB --------------------------//
  if ((kuiAvail & TOPLEFT_MB_POS) && IS_SVC_INTER (pMbTopLeft->uiMbType)) {
    const bool kbTopLeftSkip = (MB_TYPE_SKIP == pMbTopLeft->uiMbType) && (0 == pVaaBgMbFlag[-iMbWidth - 1]);

    pMvUnit->sMotionVectorCache[0] = pMbTopLeft->sMv[15];
    pMvUnit->iRefIndexCache[0]     = pMbTopLeft->pRefIndex[3];

    pMbCache->iSadCost[0]     = pMbTopLeft->pSadCost[0];
    pMbCache->bMbTypeSkip[0]  = kbTopLeftSkip;
    pMbCache->iSadCostSkip[0] = kbTopLeftSkip ? pMbCache->pEncSad[-iMbWidth - 1] : 0;
  } else { // unavailable or not inter
    ST32 (&pMvUnit->sMotionVectorCache[0], 0);
    pMvUnit->iRefIndexCache[0] = (kuiAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    pMbCache->iSadCost[0]     = 0;
    pMbCache->bMbTypeSkip[0]  = 0;
    pMbCache->iSadCostSkip[0] = 0;
  }

  //-------------------------- top-right MB -------------------------//
  if ((kuiAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER (pMbTopRight->uiMbType)) {
    const bool kbTopRightSkip = (MB_TYPE_SKIP == pMbTopRight->uiMbType) && (0 == pVaaBgMbFlag[-iMbWidth + 1]);

    pMvUnit->sMotionVectorCache[5] = pMbTopRight->sMv[12];
    pMvUnit->iRefIndexCache[5]     = pMbTopRight->pRefIndex[2];

    pMbCache->iSadCost[2]     = pMbTopRight->pSadCost[0];
    pMbCache->bMbTypeSkip[2]  = kbTopRightSkip;
    pMbCache->iSadCostSkip[2] = kbTopRightSkip ? pMbCache->pEncSad[-iMbWidth + 1] : 0;
  } else { // unavailable or not inter
    ST32 (&pMvUnit->sMotionVectorCache[5], 0);
    pMvUnit->iRefIndexCache[5] = (kuiAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;

    pMbCache->iSadCost[2]     = 0;
    pMbCache->bMbTypeSkip[2]  = 0;
    pMbCache->iSadCostSkip[2] = 0;
  }

  //------------- right-top 4x4 blocks: never available -------------//
  for (iSlot = 0; iSlot < 5; ++iSlot) {
    ST32 (&pMvUnit->sMotionVectorCache[kuiNeverAvail[iSlot]], 0);
    pMvUnit->iRefIndexCache[kuiNeverAvail[iSlot]] = REF_NOT_AVAIL;
  }
}

/*!
 * \brief   Select the inter neighbour-cache filler stored in the function pointer list.
 *
 * \param   pFuncList   function pointer list to update
 * \param   kiFlag      non-zero selects FillNeighborCacheInterWithBGD,
 *                      zero selects FillNeighborCacheInterWithoutBGD
 */
void InitFillNeighborCacheInterFunc (SWelsFuncPtrList* pFuncList, const int32_t kiFlag) {
  if (kiFlag) {
    pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithBGD;
  } else {
    pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithoutBGD;
  }
}

/*!
 * \brief   Write the same motion vector into every 4x4 block entry of a macroblock.
 *
 * \param   pMvBuffer   destination array, filled four entries at a time up to MB_BLOCK4x4_NUM
 * \param   ksMv        motion vector to replicate
 */
void UpdateMbMv_c (SMVUnitXY* pMvBuffer, const SMVUnitXY ksMv) {
  int32_t iGroup;
  int32_t iLane;

  for (iGroup = 0; iGroup < MB_BLOCK4x4_NUM; iGroup += 4) {
    for (iLane = 0; iLane < 4; ++iLane) {
      pMvBuffer[iGroup + iLane] = ksMv;
    }
  }
}


/*!
 * \brief   Classify the four 8x8 SADs of a macroblock against their mean.
 *
 * \param   pSad8x8     four SAD values, one per 8x8 block
 *
 * \return  15 when the spread of the SADs (sum of squared differences of the values
 *          scaled down by 64) is below INTER_VARIANCE_SAD_THRESHOLD; otherwise a 4-bit
 *          mask where bit 3 is block 0 ... bit 0 is block 3, set when that block's SAD
 *          is above the mean.
 */
uint8_t MdInterAnalysisVaaInfo_c (int32_t* pSad8x8) {
  int32_t iSadTotal = 0;
  int32_t iSpread   = 0;
  int32_t iBlk;

  for (iBlk = 0; iBlk < 4; ++iBlk) {
    iSadTotal += pSad8x8[iBlk];
  }

  const int32_t kiMeanSad       = iSadTotal >> 2;
  const int32_t kiMeanSadScaled = kiMeanSad >> 6;

  for (iBlk = 0; iBlk < 4; ++iBlk) {
    const int32_t kiDelta = (pSad8x8[iBlk] >> 6) - kiMeanSadScaled;
    iSpread += kiDelta * kiDelta;
  }

  if (iSpread < INTER_VARIANCE_SAD_THRESHOLD) {
    return 15;
  }

  uint8_t uiAboveMeanMask = 0;
  for (iBlk = 0; iBlk < 4; ++iBlk) {
    if (pSad8x8[iBlk] > kiMeanSad) {
      uiAboveMeanMask |= (0x08 >> iBlk); // block 0 -> 0x08 ... block 3 -> 0x01
    }
  }
  return uiAboveMeanMask;
}

/*!
 * \brief   Measure how much the 4x4 block averages of a 16x16 luma area differ.
 *
 *          The area is split into sixteen 4x4 blocks. For every block the mean of its
 *          16 samples is taken (integer, rounded down). The result is
 *          sum(mean^2) - (sum(mean)^2 >> 4), i.e. 16 times the variance of the block
 *          means, up to integer truncation.
 *
 * \param   pDataY      top-left sample of the 16x16 area
 * \param   kiLineSize  distance in bytes between two rows of pDataY
 *
 * \return  the variance measure described above (0 for a flat area)
 *
 * \note    The sums are accumulated in a single pass; no intermediate buffer is needed
 *          in the C implementation.
 */
int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
  int32_t iSumAvg = 0;
  int32_t iSumSqr = 0;
  int32_t iBlkY, iBlkX, iRow;

  for (iBlkY = 0; iBlkY < 16; iBlkY += 4) {
    for (iBlkX = 0; iBlkX < 16; iBlkX += 4) {
      const uint8_t* kpBlk = pDataY + iBlkY * kiLineSize + iBlkX;
      int32_t iBlkSum = 0;

      for (iRow = 0; iRow < 4; ++iRow) {
        const uint8_t* kpLine = kpBlk + iRow * kiLineSize;
        iBlkSum += kpLine[0] + kpLine[1] + kpLine[2] + kpLine[3];
      }

      // at most 16 * 255 = 4080, so the mean always fits the 16-bit range used before
      const int32_t kiBlkAvg = iBlkSum >> 4;
      iSumAvg += kiBlkAvg;
      iSumSqr += kiBlkAvg * kiBlkAvg;
    }
  }

  return /*variance =*/ (iSumSqr - ((iSumAvg * iSumAvg) >> 4));
}

/*!
 * \brief   Install the VAA analysis / MV update routines into the function pointer list.
 *          The C implementations are always installed first; when built with X86_ASM they
 *          are then replaced according to the CPU feature bits in kuiCpuFlag.
 *
 * \param   pFuncList   function pointer list to update
 * \param   kuiCpuFlag  CPU feature mask (WELS_CPU_* bits)
 */
void InitIntraAnalysisVaaInfo (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuFlag) {
  // portable defaults
  pFuncList->pfUpdateMbMv              = UpdateMbMv_c;
  pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_c;
  pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_c;

#if defined(X86_ASM)
  const bool kbHasSse2  = (WELS_CPU_SSE2  == (kuiCpuFlag & WELS_CPU_SSE2));
  const bool kbHasSsse3 = (WELS_CPU_SSSE3 == (kuiCpuFlag & WELS_CPU_SSSE3));
  const bool kbHasSse41 = (WELS_CPU_SSE41 == (kuiCpuFlag & WELS_CPU_SSE41));

  if (kbHasSse2) {
    pFuncList->pfUpdateMbMv              = UpdateMbMv_sse2;
    pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_sse2;
    pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_sse2;
  }
  // later checks override the SSE2 choice for the routine they cover
  if (kbHasSsse3) {
    pFuncList->pfGetVarianceFromIntraVaa = AnalysisVaaInfoIntra_ssse3;
  }
  if (kbHasSse41) {
    pFuncList->pfGetMbSignFromInterVaa   = MdInterAnalysisVaaInfo_sse41;
  }
#endif//X86_ASM
}

/*!
 * \brief   Tell whether the variance measure of a macroblock reaches
 *          INTRA_VARIANCE_SAD_THRESHOLD.
 *
 * \param   pEncCtx     encoder context; supplies the luma stride of the current layer
 *                      and the variance routine
 * \param   pEncMb      top-left luma sample of the macroblock
 *
 * \return  true when the variance is greater than or equal to the threshold
 */
bool MdIntraAnalysisVaaInfo (sWelsEncCtx* pEncCtx, uint8_t* pEncMb) {
  const int32_t kiLumaStride = pEncCtx->pCurDqLayer->iEncStride[0];
  const int32_t kiMbVariance = pEncCtx->pFuncList->pfGetVarianceFromIntraVaa (pEncMb, kiLumaStride);

  if (kiMbVariance < INTRA_VARIANCE_SAD_THRESHOLD) {
    return false;
  }
  return true;
}

/*!
 * \brief   Point the four ME refinement work buffers into pMbCache->pBufferInterPredMe.
 *          The regions start 640 elements apart and each pointer is additionally
 *          advanced by iStride.
 *
 * \param   pMeRefine   receives pHalfPixH, pHalfPixV, pQuarPixBest and pQuarPixTmp
 * \param   pMbCache    owner of the backing buffer
 * \param   iStride     extra offset added to every region start
 */
void InitMeRefinePointer (SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride) {
  uint8_t* pWorkBuf = &pMbCache->pBufferInterPredMe[0];

  // half-pel planes
  pMeRefine->pHalfPixH    = pWorkBuf + iStride;
  pMeRefine->pHalfPixV    = pWorkBuf + 640 + iStride;

  // quarter-pel planes (best so far / scratch)
  pMeRefine->pQuarPixBest = pWorkBuf + 1280 + iStride;
  pMeRefine->pQuarPixTmp  = pWorkBuf + 1920 + iStride;
}
// Working set handed from MeRefineFracPixel to MeRefineQuarPixel for the quarter-pel search.
// Candidate order in the arrays: [0] top, [1] bottom, [2] left, [3] right.
typedef struct TagQuarParams {
  int32_t iBestCost;      // in: best cost so far; out: lowered when a quarter-pel candidate wins
  int32_t iBestHalfPix;   // half-pel winner chosen by the caller
  int32_t iStrideA;       // stride of pSrcB[0] and pSrcB[1]
  int32_t iStrideB;       // stride of pSrcB[2] and pSrcB[3]
  uint8_t* pRef;          // integer-pel reference block
  uint8_t* pSrcB[4];      // second averaging source per candidate (stride iStrideA / iStrideB)
  uint8_t* pSrcA[4];      // first averaging source per candidate (stride ME_REFINE_BUF_STRIDE)
  int32_t iLms[4];        // MVD cost added to each candidate's distortion
  int32_t iBestQuarPix;   // out: winning ME_QUAR_PIXEL_* value, unchanged when none wins
} SQuarRefineParams;

// Record iCurCost as the new best cost and swap the two buffer pointers, so that the
// buffer that just won becomes prev_best and the old best becomes the scratch buffer.
// Relies on the caller's locals pParams, iCurCost and pTmp.
#define SWITCH_BEST_TMP_BUF(prev_best, curr_best){\
	pParams->iBestCost = iCurCost;\
	pTmp = prev_best;\
	prev_best = curr_best;\
	curr_best = pTmp;\
}
// Distortion of me_buf (stride ME_REFINE_BUF_STRIDE) against the source block plus the
// cost term lm. Relies on the caller's pFunc, kuiPixel, pEncMb and iStrideEnc.
#define CALC_COST(me_buf, lm) ( pFunc->sSampleDealingFuncs.pfMeCost[kuiPixel](pEncMb, iStrideEnc, me_buf, ME_REFINE_BUF_STRIDE) + lm )

/*!
 * \brief   Try the four quarter-pel candidates (top, bottom, left, right) around the
 *          current best position. Each candidate is built by averaging pSrcA[i] and
 *          pSrcB[i] into pMeRefine->pQuarPixTmp; when its cost beats pParams->iBestCost
 *          the cost and iBestQuarPix are updated and pQuarPixBest / pQuarPixTmp are
 *          swapped, so pQuarPixBest always holds the best candidate found.
 *
 * \param   pFunc       function pointer list (averaging and cost functions)
 * \param   pMe         motion estimation unit; supplies the source block and block size
 * \param   pMeRefine   work buffers
 * \param   kiWidth     block width; kiWidth >> 4 selects the averaging routine
 * \param   kiHeight    block height
 * \param   pParams     candidate sources, strides, MVD costs; receives the result
 * \param   iStrideEnc  stride of the source block
 */
inline void MeRefineQuarPixel (SWelsFuncPtrList* pFunc, SWelsME* pMe, SMeRefinePointer* pMeRefine,
                               const int32_t kiWidth, const int32_t kiHeight, SQuarRefineParams* pParams, int32_t iStrideEnc) {
  // result reported for candidate 0..3
  const int32_t kiQuarPixOfCand[4] = {
    ME_QUAR_PIXEL_TOP, ME_QUAR_PIXEL_BOTTOM, ME_QUAR_PIXEL_LEFT, ME_QUAR_PIXEL_RIGHT
  };
  PWelsSampleAveragingFunc pfAverage = pFunc->sMcFuncs.pfSampleAveraging[kiWidth >> 4];
  // the names below are referenced by CALC_COST / SWITCH_BEST_TMP_BUF
  uint8_t* pEncMb        = pMe->pEncMb;
  const uint8_t kuiPixel = pMe->uiBlockSize;
  uint8_t* pTmp          = NULL;
  int32_t iCurCost;
  int32_t iCand;

  for (iCand = 0; iCand < 4; ++iCand) {
    // top/bottom read pSrcB with iStrideA, left/right with iStrideB
    const int32_t kiSrcBStride = (iCand < 2) ? pParams->iStrideA : pParams->iStrideB;

    pfAverage (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[iCand], ME_REFINE_BUF_STRIDE,
               pParams->pSrcB[iCand], kiSrcBStride, kiHeight);

    iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[iCand]);
    if (iCurCost < pParams->iBestCost) {
      pParams->iBestQuarPix = kiQuarPixOfCand[iCand];
      SWITCH_BEST_TMP_BUF (pMeRefine->pQuarPixBest, pMeRefine->pQuarPixTmp);
    }
  }
}

/*!
 * \brief   Sub-pixel refinement of the motion vector held in pMe.
 *
 *          Starting from the current MV, the four half-pel neighbours (offsets of +-2 in
 *          MV units) are evaluated, then the four quarter-pel neighbours (offsets of +-1)
 *          around the best position found so far. The winning MV and its cost are written
 *          back to pMe->sMv and pMe->uiSatdCost, and the matching prediction block is
 *          copied to pMemPredInterMb with a stride of MB_WIDTH_LUMA.
 *
 * \param   pEncCtx         encoder context (function pointers, source/reference strides)
 * \param   pMemPredInterMb destination of the final prediction block
 * \param   pMe             motion estimation unit; in: MV, predictor, source/reference
 *                          block pointers; out: refined MV and cost
 * \param   pMeRefine       work buffers set up by InitMeRefinePointer; pHalfPixHV is set
 *                          here and aliases pHalfPixV or pHalfPixH
 * \param   iWidth          block width  (16 or 8)
 * \param   iHeight         block height (16 or 8)
 */
void MeRefineFracPixel (sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe,
                        SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight) {
  SWelsFuncPtrList* pFunc = pEncCtx->pFuncList;
  int16_t iMvx = pMe->sMv.iMvX;
  int16_t iMvy = pMe->sMv.iMvY;

  int16_t iHalfMvx = iMvx;
  int16_t iHalfMvy = iMvy;
  const int32_t kiStrideEnc = pEncCtx->pCurDqLayer->iEncStride[0];
  const int32_t kiStrideRef = pEncCtx->pCurDqLayer->pRefPic->iLineSize[0];

  uint8_t* pEncData = pMe->pEncMb;
  uint8_t* pRef = pMe->pRefMb;//091010

  int32_t iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;

  SQuarRefineParams sParams;
  // MV adjustment per quarter-pel result, indexed by the ME_*_QUAR_PIXEL value.
  // The Y adjustments live in the same (read-only) table, three entries further on.
  static const int32_t iMvQuarAddX[10] = {0, 0, -1, 1, 0, 0, 0, -1, 1, 0};
  const int32_t* pMvQuarAddY = iMvQuarAddX + 3;
  uint8_t* pBestPredInter = pRef;
  int32_t iInterBlk4Stride = ME_REFINE_BUF_STRIDE;

  int32_t iBestCost;
  int32_t iCurCost;
  int32_t iBestHalfPix;

  // cost of the integer-pel position: reuse the stored SATD when ME and MD both use SATD
  if ((pFunc->sSampleDealingFuncs.pfMeCost == pFunc->sSampleDealingFuncs.pfSampleSatd)
      && (pFunc->sSampleDealingFuncs.pfMdCost == pFunc->sSampleDealingFuncs.pfSampleSatd)) {
    iBestCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
  } else {
    iBestCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiBlockSize] (pEncData, kiStrideEnc, pRef, kiStrideRef) +
                COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
  }

  iBestHalfPix = REFINE_ME_NO_BEST_HALF_PIXEL;

  //step 1: get [iWidth][iHeight+1] half pixel from vertical filter
  pFunc->sMcFuncs.pfLumaHalfpelVer (pRef - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE, iWidth,
                                    iHeight + 1);

  //===========================(0, -2)==============================//
  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiBlockSize] (pEncData, kiStrideEnc, pMeRefine->pHalfPixV,
             ME_REFINE_BUF_STRIDE) +
             COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - 2 - pMe->sMvp.iMvY);
  if (iCurCost < iBestCost) {
    iBestCost = iCurCost;
    iBestHalfPix = REFINE_ME_HALF_PIXEL_TOP;
    pBestPredInter = pMeRefine->pHalfPixV;
  }
  //===========================(0, 2)==============================//
  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiBlockSize] (pEncData, kiStrideEnc,
             pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE, ME_REFINE_BUF_STRIDE) +
             COST_MVD (pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy + 2 - pMe->sMvp.iMvY);
  if (iCurCost < iBestCost) {
    iBestCost = iCurCost;
    iBestHalfPix = REFINE_ME_HALF_PIXEL_BOTTOM;
    pBestPredInter = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
  }

  //step 2: get [iWidth+1][iHeight] half pixel from horizontal filter
  pFunc->sMcFuncs.pfLumaHalfpelHor (pRef - 1, kiStrideRef, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE, iWidth + 1,
                                    iHeight);

  //===========================(-2, 0)==============================//
  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiBlockSize] (pEncData, kiStrideEnc, pMeRefine->pHalfPixH,
             ME_REFINE_BUF_STRIDE) +
             COST_MVD (pMe->pMvdCost, iMvx - 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
  if (iCurCost < iBestCost) {
    iBestCost = iCurCost;
    iBestHalfPix = REFINE_ME_HALF_PIXEL_LEFT;
    pBestPredInter = pMeRefine->pHalfPixH;
  }
  //===========================(2, 0)===============================//
  iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiBlockSize] (pEncData, kiStrideEnc, pMeRefine->pHalfPixH + 1,
             ME_REFINE_BUF_STRIDE) +
             COST_MVD (pMe->pMvdCost, iMvx + 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
  if (iCurCost < iBestCost) {
    iBestCost = iCurCost;
    iBestHalfPix = REFINE_ME_HALF_PIXEL_RIGHT;
    pBestPredInter = pMeRefine->pHalfPixH + 1;
  }

  sParams.iBestCost = iBestCost;
  sParams.iBestHalfPix = iBestHalfPix;
  sParams.pRef = pRef;
  sParams.iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;

  //step 3: set up the quarter-pel candidates.
  //        No half-pel winner: average the integer-pel block with each half-pel plane.
  //        Otherwise: the (2, 2) centre half-pel plane of [X+1][X+1] samples is needed as
  //        well; it overwrites the half-pel plane that did not win.
  if (REFINE_ME_NO_BEST_HALF_PIXEL == iBestHalfPix) {
    sParams.iStrideA = kiStrideRef;
    sParams.iStrideB = kiStrideRef;
    sParams.pSrcA[0] = pMeRefine->pHalfPixV;
    sParams.pSrcA[1] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
    sParams.pSrcA[2] = pMeRefine->pHalfPixH;
    sParams.pSrcA[3] = pMeRefine->pHalfPixH + 1;

    sParams.pSrcB[0] = sParams.pSrcB[1] = sParams.pSrcB[2] = sParams.pSrcB[3] = pRef;
  } else { //must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
    switch (iBestHalfPix) {
    case REFINE_ME_HALF_PIXEL_LEFT: {
      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
                                        iWidth + 1, iHeight + 1);

      iHalfMvx -= 2;
      sParams.iStrideA = ME_REFINE_BUF_STRIDE;
      sParams.iStrideB = kiStrideRef;
      sParams.pSrcA[0] = pMeRefine->pHalfPixH;
      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
      sParams.pSrcB[0] = pMeRefine->pHalfPixHV;
      sParams.pSrcB[1] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
      sParams.pSrcB[2] = pRef - 1;
      sParams.pSrcB[3] = pRef;

    }
    break;
    case REFINE_ME_HALF_PIXEL_RIGHT: {
      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
                                        iWidth + 1, iHeight + 1);
      iHalfMvx += 2;
      sParams.iStrideA = ME_REFINE_BUF_STRIDE;
      sParams.iStrideB = kiStrideRef;
      sParams.pSrcA[0] = pMeRefine->pHalfPixH + 1;
      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
      sParams.pSrcB[0] = pMeRefine->pHalfPixHV + 1;
      sParams.pSrcB[1] = pMeRefine->pHalfPixHV + 1 + ME_REFINE_BUF_STRIDE;
      sParams.pSrcB[2] = pRef;
      sParams.pSrcB[3] = pRef + 1;
    }
    break;
    case REFINE_ME_HALF_PIXEL_TOP: {
      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
                                        iWidth + 1, iHeight + 1);

      iHalfMvy -= 2;
      sParams.iStrideA = kiStrideRef;
      sParams.iStrideB = ME_REFINE_BUF_STRIDE;
      sParams.pSrcA[0] = pMeRefine->pHalfPixV;
      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
      sParams.pSrcB[0] = pRef - kiStrideRef;
      sParams.pSrcB[1] = pRef;
      sParams.pSrcB[2] = pMeRefine->pHalfPixHV;
      sParams.pSrcB[3] = pMeRefine->pHalfPixHV + 1;
    }
    break;
    case REFINE_ME_HALF_PIXEL_BOTTOM: {
      pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
      pFunc->sMcFuncs.pfLumaHalfpelCen (pRef - 1 - kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV, ME_REFINE_BUF_STRIDE,
                                        iWidth + 1, iHeight + 1);
      iHalfMvy += 2;
      sParams.iStrideA = kiStrideRef;
      sParams.iStrideB = ME_REFINE_BUF_STRIDE;
      sParams.pSrcA[0] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
      sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
      sParams.pSrcB[0] = pRef;
      sParams.pSrcB[1] = pRef + kiStrideRef;
      sParams.pSrcB[2] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
      sParams.pSrcB[3] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE + 1;
    }
    break;
    default:
      break;
    }
  }

  // MVD cost of the four quarter-pel candidates around the (possibly moved) half-pel MV:
  // top, bottom, left, right
  sParams.iLms[0] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY);
  sParams.iLms[1] = COST_MVD (pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY);
  sParams.iLms[2] = COST_MVD (pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);
  sParams.iLms[3] = COST_MVD (pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY);

  //step 4: quarter-pel search
  MeRefineQuarPixel (pFunc, pMe, pMeRefine, iWidth, iHeight, &sParams, kiStrideEnc);

  if (iBestCost > sParams.iBestCost) {
    pBestPredInter = pMeRefine->pQuarPixBest;
    iBestCost = sParams.iBestCost;
  }
  iBestQuarPix = sParams.iBestQuarPix;

  //update final best MV
  pMe->sMv.iMvX = iHalfMvx + iMvQuarAddX[iBestQuarPix];
  pMe->sMv.iMvY = iHalfMvy + pMvQuarAddY[iBestQuarPix];
  pMe->uiSatdCost = iBestCost;

  //No half or quarter pixel best, so do MC with integer pixel MV
  if (iBestHalfPix + iBestQuarPix == NO_BEST_FRAC_PIX) {
    pBestPredInter = pRef;
    iInterBlk4Stride = kiStrideRef;
  }

  //step 5: copy the winning prediction block
  if (MB_WIDTH_LUMA == iWidth && MB_HEIGHT_LUMA == iHeight) { //P16x16
    pFunc->pfCopy16x16NotAligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter,
                                  iInterBlk4Stride); // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
  } else if (MB_WIDTH_LUMA == iWidth && MB_HEIGHT_CHROMA == iHeight) { //P16x8
    pFunc->pfCopy16x8NotAligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter,
                                 iInterBlk4Stride); // dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
  } else if (MB_WIDTH_CHROMA == iWidth && MB_HEIGHT_LUMA == iHeight) { //P8x16
    pFunc->pfCopy8x16Aligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride);
  } else { //P8x8
    pFunc->pfCopy8x8Aligned (pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride);
  }
}

/*
 * Fill pBlkStride[0..15] with linear offsets of the form  x + y * kiStrideRef.
 *
 * pBlkStride:  output array, must have room for 16 entries.
 * kiStrideRef: line stride that the y offsets are multiplied by.
 *
 * The x/y pairs come from two fixed tables holding 0, 4, 8 or 12;
 * presumably these are the top-left sample positions of the sixteen 4x4
 * blocks of a 16x16 macroblock, ordered quadrant by quadrant -- confirm
 * against the callers.
 */
void InitBlkStrideWithRef (int32_t* pBlkStride, const int32_t kiStrideRef) {
  // horizontal component of every entry
  static const uint8_t kuiOffsetX[16] = {
    0,  4, 0,  4,
    8, 12, 8, 12,
    0,  4, 0,  4,
    8, 12, 8, 12
  };
  // vertical component of every entry (scaled by the stride below)
  static const uint8_t kuiOffsetY[16] = {
    0, 0,  4,  4,
    0, 0,  4,  4,
    8, 8, 12, 12,
    8, 8, 12, 12
  };

  for (int32_t iBlk = 0; iBlk < 16; ++iBlk) {
    const int32_t kiCol = kuiOffsetX[iBlk];
    const int32_t kiRow = kuiOffsetY[iBlk];
    pBlkStride[iBlk] = kiCol + kiRow * kiStrideRef;
  }
}

/*
 * Fill the MVD cost table, one row per entry of g_kiQpCostTable (52 rows).
 *
 * pMvdCostInter: output buffer; receives 52 consecutive rows of
 *                (2 * (kiMvdSz >> 1) + 1) uint16_t entries each.
 * kiMvdSz:       row size, (648*2+1) or (972*2+1).
 *
 * With half = kiMvdSz >> 1, entry [half + d] of a row holds
 *   lambda * BsSizeSE (d)   for d in [-half, half], d != 0
 *   lambda                  for d == 0
 * where lambda is the row's g_kiQpCostTable value narrowed to uint16_t.
 * Products are stored as uint16_t as well.
 *
 * The inner loop emits entries four at a time, so half has to be a multiple
 * of 4 for the rows to line up (true for both sizes quoted above).
 */
void MvdCostInit (uint16_t* pMvdCostInter, const int32_t kiMvdSz) {
  const int32_t kiHalf = kiMvdSz >> 1;
  uint16_t* pNegCursor = pMvdCostInter;               // walks d = -half .. -1, then d = 0
  uint16_t* pPosCursor = pMvdCostInter + kiHalf + 1;  // walks d = 1 .. half

  for (int32_t iQp = 0; iQp < 52; ++iQp) {
    const uint16_t kuiLambda = g_kiQpCostTable[iQp];
    int32_t iNegMvd = -kiHalf;
    int32_t iPosMvd = 1;

    for (int32_t iDone = 0; iDone < kiHalf; iDone += 4) {
      for (int32_t iStep = 0; iStep < 4; ++iStep) {
        *pNegCursor++ = kuiLambda * BsSizeSE (iNegMvd++);
      }
      for (int32_t iStep = 0; iStep < 4; ++iStep) {
        *pPosCursor++ = kuiLambda * BsSizeSE (iPosMvd++);
      }
    }

    // the negative cursor now sits on the d == 0 slot of this row
    *pNegCursor = kuiLambda;

    // advance both cursors to their start positions in the next row
    pNegCursor += kiHalf + 1;
    pPosCursor += kiHalf + 1;
  }
}

/*
 * Predict a SAD value from the cached SADs of the neighbouring blocks and
 * scale the prediction by 0.90625 (1 - 1/8 + 1/32, close to 0.9).
 *
 * pRefIndexCache: neighbour reference indices; [6] = left (A), [1] = top (B),
 *                 [5] = top-right (C), [0] = top-left (D, used when C is
 *                 REF_NOT_AVAIL).
 * pSadCostCache:  neighbour SADs; [3] = left, [1] = top, [2] = top-right,
 *                 [0] = top-left.
 * uiRef:          reference index the neighbours are compared against.
 * pSadPred:       output, receives the scaled prediction.
 *
 * Selection: if only the left neighbour is available its SAD is taken.
 * Otherwise, if exactly one of A / B / C(D) uses uiRef, that neighbour's SAD
 * is taken; in every other case the median of the three SADs is used.
 */
void PredictSad (int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t* pSadPred) {
  const int32_t kiLeftRef = pRefIndexCache[6];
  const int32_t kiTopRef  = pRefIndexCache[1];
  const int32_t kiLeftSad = pSadCostCache[3];
  const int32_t kiTopSad  = pSadCostCache[1];
  int32_t iDiagRef = pRefIndexCache[5];
  int32_t iDiagSad = pSadCostCache[2];
  int32_t iPred;

  // top-right missing: fall back to the top-left neighbour
  if (REF_NOT_AVAIL == iDiagRef) {
    iDiagRef = pRefIndexCache[0];
    iDiagSad = pSadCostCache[0];
  }

  if (REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == iDiagRef && REF_NOT_AVAIL != kiLeftRef) {
    iPred = kiLeftSad;
  } else {
    // one bit per neighbour whose reference index equals uiRef
    const int32_t kiMatchMask = ((uiRef == kiLeftRef) << MB_LEFT_BIT)
                                | ((uiRef == kiTopRef) << MB_TOP_BIT)
                                | ((uiRef == iDiagRef) << MB_TOPRIGHT_BIT);

    if (LEFT_MB_POS == kiMatchMask) {            // only A matches
      iPred = kiLeftSad;
    } else if (TOP_MB_POS == kiMatchMask) {      // only B matches
      iPred = kiTopSad;
    } else if (TOPRIGHT_MB_POS == kiMatchMask) { // only C (or D) matches
      iPred = iDiagSad;
    } else {
      iPred = WelsMedian (kiLeftSad, kiTopSad, iDiagSad);
    }
  }

  // Multiply by 0.90625 in 6-bit fixed point with rounding.
  // *64 will not overflow: SAD range is 0 ~ 255*256 (max 2^16), int32_t is enough.
  const int32_t kiFixed = iPred << 6;
  *pSadPred = (kiFixed - (kiFixed >> 3) + (kiFixed >> 5) + 32) >> 6;
}


/*
 * Predict a SAD value from the neighbouring blocks, taking into account only
 * neighbours whose skip flag is set.
 *
 * pRefIndexCache: neighbour reference indices; [6] = left (A), [1] = top (B),
 *                 [5] = top-right (C), [0] = top-left (D, used when C is
 *                 REF_NOT_AVAIL).
 * pMbSkipCache:   neighbour skip flags; [3] = left, [1] = top,
 *                 [2] = top-right, [0] = top-left.
 * pSadCostCache:  neighbour SADs, indexed like pMbSkipCache. A neighbour whose
 *                 skip flag is not set contributes a SAD of 0 and its cache
 *                 entry is not read.
 * uiRef:          reference index the neighbours are compared against.
 * iSadPredSkip:   output, receives the prediction (no scaling is applied).
 *
 * Selection: if only the left neighbour is available its (flag-gated) SAD is
 * taken. Otherwise, if exactly one of A / B / C(D) both uses uiRef and has its
 * skip flag set, that neighbour's SAD is taken; in every other case the median
 * of the three flag-gated SADs is used.
 */
void PredictSadSkip (int8_t* pRefIndexCache, bool* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef,
                     int32_t* iSadPredSkip) {
  const int32_t kiLeftRef = pRefIndexCache[6];
  const int32_t kiTopRef  = pRefIndexCache[1];
  int32_t iDiagRef        = pRefIndexCache[5];

  const bool kbLeftSkip = (pMbSkipCache[3] == 1);
  const bool kbTopSkip  = (pMbSkipCache[1] == 1);
  bool bDiagSkip        = (pMbSkipCache[2] == 1);

  const int32_t kiLeftSad = kbLeftSkip ? pSadCostCache[3] : 0;
  const int32_t kiTopSad  = kbTopSkip ? pSadCostCache[1] : 0;
  int32_t iDiagSad        = bDiagSkip ? pSadCostCache[2] : 0;

  // top-right missing: fall back to the top-left neighbour
  if (REF_NOT_AVAIL == iDiagRef) {
    iDiagRef  = pRefIndexCache[0];
    bDiagSkip = (pMbSkipCache[0] == 1);
    iDiagSad  = bDiagSkip ? pSadCostCache[0] : 0;
  }

  if (REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == iDiagRef && REF_NOT_AVAIL != kiLeftRef) {
    *iSadPredSkip = kiLeftSad;
    return;
  }

  // one bit per neighbour that matches uiRef and has its skip flag set
  const int32_t kiMatchMask = (((uiRef == kiLeftRef) && kbLeftSkip) << MB_LEFT_BIT)
                              | (((uiRef == kiTopRef) && kbTopSkip) << MB_TOP_BIT)
                              | (((uiRef == iDiagRef) && bDiagSkip) << MB_TOPRIGHT_BIT);

  if (LEFT_MB_POS == kiMatchMask) {            // only A qualifies
    *iSadPredSkip = kiLeftSad;
  } else if (TOP_MB_POS == kiMatchMask) {      // only B qualifies
    *iSadPredSkip = kiTopSad;
  } else if (TOPRIGHT_MB_POS == kiMatchMask) { // only C (or D) qualifies
    *iSadPredSkip = iDiagSad;
  } else {
    *iSadPredSkip = WelsMedian (kiLeftSad, kiTopSad, iDiagSad);
  }
}
}