shithub: openh264

ref: bcb76d383b778b944bf75bdea4f02c1c3c0bcf37
dir: /codec/encoder/core/src/svc_motion_estimate.cpp/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file  svc motion estimate.c
 *
 * \brief  Interfaces introduced in svc mb motion estimation
 *
 * \date  08/11/2009 Created
 *
 *************************************************************************************
 */

#include "cpu_core.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
#include "wels_transpose_matrix.h"

namespace WelsSVCEnc {

const int32_t QStepx16ByQp[52] = {  /* save QStep<<4 for int32_t */
    10,  11,  13,  14,  16,  18,  /* 0~5   */
    20,  22,  26,  28,  32,  36,  /* 6~11  */
    40,  44,  52,  56,  64,  72,  /* 12~17 */
    80,  88,  104, 112, 128, 144, /* 18~23 */
    160, 176, 208, 224, 256, 288, /* 24~29 */
    320, 352, 416, 448, 512, 576, /* 30~35 */
    640, 704, 832, 896, 1024, 1152, /* 36~41 */
    1280, 1408, 1664, 1792, 2048, 2304, /* 42~47 */
    2560, 2816, 3328, 3584     /* 48~51 */
};

static inline void UpdateMeResults( const SMVUnitXY ksBestMv, const uint32_t kiBestSadCost, uint8_t* pRef, SWelsME * pMe )
{
  pMe->sMv = ksBestMv;
  pMe->pRefMb = pRef;
  pMe->uiSadCost = kiBestSadCost;
}
static inline void MeEndIntepelSearch( SWelsME * pMe )
{
    /* -> qpel mv */
    pMe->sMv.iMvX <<= 2;
    pMe->sMv.iMvY <<= 2;
    pMe->uiSatdCost = pMe->uiSadCost;
}

void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScreenContent ) {
  if (!bScreenContent) {
    pFuncList->pfCheckDirectionalMv = CheckDirectionalMvFalse;
    pFuncList->pfCalculateBlockFeatureOfFrame[0] =
      pFuncList->pfCalculateBlockFeatureOfFrame[1] = NULL;
    pFuncList->pfCalculateSingleBlockFeature[0] =
      pFuncList->pfCalculateSingleBlockFeature[1] = NULL;
    pFuncList->pfUpdateFMESwitch = UpdateFMESwitchNull;
  } else {
    pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;

    //for cross serarch
    pFuncList->pfVerticalFullSearch = LineFullSearch_c;
    pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
#if defined (X86_ASM)
    if ( uiCpuFlag & WELS_CPU_SSE41 ) {
      pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
      pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
      pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
      pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
    }
#endif

    //for feature search
    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_c;
    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_c;
    //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
    pFuncList->pfUpdateFMESwitch = UpdateFMESwitchNull;
  }
}

/*!
 * \brief  BL mb motion estimate search
 *
 * \param  enc      Wels encoder context
 * \param  pMe          Wels me information
 *
 * \return  NONE
 */

void WelsMotionEstimateSearch (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice) {
  SDqLayer* pCurDqLayer      = (SDqLayer*)pLplayer;
  SWelsME* pMe            = (SWelsME*)pLpme;
  SSlice* pSlice          = (SSlice*)pLpslice;
  const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0];
  const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0];

  //  Step 1: Initial point prediction
  if ( !WelsMotionEstimateInitialPoint (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef) ) {
    pFuncList->pfSearchMethod[pMe->uiBlockSize] (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef);
    MeEndIntepelSearch(pMe);
  }

  pFuncList->pfCalculateSatd( pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc, kiStrideRef );
}

/*!
 * \brief  EL mb motion estimate initial point testing
 *
 * \param  pix_pFuncList  SSampleDealingFunc
 * \param  pMe          Wels me information
 * \param  mv_range  search range in motion estimate
 * \param  point      the best match point in motion estimation
 *
 * \return  NONE
 */
bool WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, int32_t iStrideEnc,
                                     int32_t iStrideRef) {
  PSampleSadSatdCostFunc pSad    = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
  const uint16_t* kpMvdCost  = pMe->pMvdCost;
  uint8_t* const kpEncMb    = pMe->pEncMb;
  int16_t iMvc0, iMvc1;
  int32_t iSadCost;
  int32_t iBestSadCost;
  uint8_t* pRefMb;
  uint8_t* pFref2;
  uint32_t i;
  const uint32_t kuiMvcNum    = pSlice->uiMvcNum;
  const SMVUnitXY* kpMvcList  = &pSlice->sMvc[0];
  const SMVUnitXY ksMvStartMin    = pSlice->sMvStartMin;
  const SMVUnitXY ksMvStartMax    = pSlice->sMvStartMax;
  const SMVUnitXY ksMvp    = pMe->sMvp;
  SMVUnitXY sMv;

  //  Step 1: Initial point prediction
  // init with sMvp
  sMv.iMvX  = WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
  sMv.iMvY  = WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);

  pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];

  iBestSadCost = pSad (kpEncMb, iStrideEnc, pRefMb, iStrideRef);
  iBestSadCost += COST_MVD (kpMvdCost, ((sMv.iMvX) << 2) - ksMvp.iMvX, ((sMv.iMvY) << 2) - ksMvp.iMvY);

  for (i = 0; i < kuiMvcNum; i++) {
    //clipping here is essential since some pOut-of-range MVC may happen here (i.e., refer to baseMV)
    iMvc0 = WELS_CLIP3 ((2 + kpMvcList[i].iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
    iMvc1 = WELS_CLIP3 ((2 + kpMvcList[i].iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);

    if (((iMvc0 - sMv.iMvX) || (iMvc1 - sMv.iMvY))) {
      pFref2 = &pMe->pRefMb[iMvc1 * iStrideRef + iMvc0];

      iSadCost = pSad (kpEncMb, iStrideEnc, pFref2, iStrideRef) +
                 COST_MVD (kpMvdCost, (iMvc0 << 2) - ksMvp.iMvX, (iMvc1 << 2) - ksMvp.iMvY);

      if (iSadCost < iBestSadCost) {
        sMv.iMvX = iMvc0;
        sMv.iMvY = iMvc1;
        pRefMb = pFref2;
        iBestSadCost = iSadCost;
      }
    }
  }

  if ( pFuncList->pfCheckDirectionalMv
    (pSad, pMe, ksMvStartMin, ksMvStartMax, iStrideEnc, iStrideRef, iSadCost) ) {
      sMv = pMe->sDirectionalMv;
      pRefMb =  &pMe->pColoRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
      iBestSadCost = iSadCost;
  }

  UpdateMeResults( sMv, iBestSadCost, pRefMb, pMe );
  if ( iBestSadCost < static_cast<int32_t>(pMe->uSadPredISatd.uiSadPred) ) {
    //Initial point early Stop
    MeEndIntepelSearch(pMe);
    return true;
  }
  return false;
}

void CalculateSatdCost( PSampleSadSatdCostFunc pSatd, void * vpMe,
                       const int32_t kiEncStride, const int32_t kiRefStride ) {
  SWelsME* pMe             = static_cast<SWelsME *>(vpMe);
  pMe->uSadPredISatd.uiSatd = pSatd(pMe->pEncMb, kiEncStride, pMe->pRefMb, kiRefStride);
  pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX,
                                                            pMe->sMv.iMvY - pMe->sMvp.iMvY);
}
void NotCalculateSatdCost( PSampleSadSatdCostFunc pSatd, void * vpMe,
                          const int32_t kiEncStride, const int32_t kiRefStride ) {
}


/////////////////////////
// Diamond Search Basics
/////////////////////////
bool WelsMeSadCostSelect (int32_t* iSadCost, const uint16_t* kpMvdCost, int32_t* pBestCost, const int32_t kiDx,
                            const int32_t kiDy, int32_t* pIx, int32_t* pIy) {
  int32_t iTempSadCost[4];
  int32_t iInputSadCost = *pBestCost;
  iTempSadCost[0] = iSadCost[0] + COST_MVD (kpMvdCost, kiDx, kiDy - 4);
  iTempSadCost[1] = iSadCost[1] + COST_MVD (kpMvdCost, kiDx, kiDy + 4);
  iTempSadCost[2] = iSadCost[2] + COST_MVD (kpMvdCost, kiDx - 4, kiDy);
  iTempSadCost[3] = iSadCost[3] + COST_MVD (kpMvdCost, kiDx + 4, kiDy);

  if (iTempSadCost[0] < *pBestCost) {
    *pBestCost = iTempSadCost[0];
    *pIx = 0;
    *pIy = 1;
  }

  if (iTempSadCost[1] < *pBestCost) {
    *pBestCost = iTempSadCost[1];
    *pIx = 0;
    *pIy = -1;
  }

  if (iTempSadCost[2] < *pBestCost) {
    *pBestCost = iTempSadCost[2];
    *pIx = 1;
    *pIy = 0;
  }

  if (iTempSadCost[3] < *pBestCost) {
    *pBestCost = iTempSadCost[3];
    *pIx = -1;
    *pIy = 0;
  }
  return (*pBestCost == iInputSadCost);
}

void WelsDiamondSearch (SWelsFuncPtrList* pFuncList, void* pLpme, void* pLpslice,
                        const int32_t kiStrideEnc,  const int32_t kiStrideRef) {
  SWelsME* pMe            = (SWelsME*)pLpme;
  PSample4SadCostFunc      pSad          =  pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiBlockSize];

  uint8_t* pFref = pMe->pRefMb;
  uint8_t* const kpEncMb = pMe->pEncMb;
  const uint16_t* kpMvdCost = pMe->pMvdCost;

  int32_t iMvDx = ((pMe->sMv.iMvX) << 2) - pMe->sMvp.iMvX;
  int32_t iMvDy = ((pMe->sMv.iMvY) << 2) - pMe->sMvp.iMvY;

  uint8_t* pRefMb = pFref;
  int32_t iBestCost = (pMe->uiSadCost);

  int32_t iTimeThreshold = ITERATIVE_TIMES;
  ENFORCE_STACK_ALIGN_1D (int32_t, iSadCosts, 4, 16)

  while (iTimeThreshold--) {
    pSad (kpEncMb, kiStrideEnc, pRefMb, kiStrideRef, &iSadCosts[0]);

    int32_t iX, iY;

    const bool kbIsBestCostWorse = WelsMeSadCostSelect (iSadCosts, kpMvdCost, &iBestCost, iMvDx, iMvDy, &iX, &iY);
    if (kbIsBestCostWorse)
      break;

    iMvDx -= iX << 2 ;
    iMvDy -= iY << 2 ;

    pRefMb -= (iX + iY * kiStrideRef);

  }

  /* integer-pel mv */
  pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) >>2;
  pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) >>2;
  pMe->uiSatdCost = pMe->uiSadCost = (iBestCost);
  pMe->pRefMb = pRefMb;
}

/////////////////////////
// DirectionalMv Basics
/////////////////////////
bool CheckDirectionalMv(PSampleSadSatdCostFunc pSad, void * vpMe,
                      const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
                      int32_t& iBestSadCost) {
  SWelsME* pMe             = static_cast<SWelsME *>(vpMe);
  const int16_t kiMvX = pMe->sDirectionalMv.iMvX;
  const int16_t kiMvY = pMe->sDirectionalMv.iMvY;

  //Check MV from scrolling detection
  if ( (BLOCK_16x16!=pMe->uiBlockSize) //scrolled_MV with P16x16 is checked SKIP checking function
    && ( kiMvX | kiMvY ) //(0,0) checked in ordinary initial point checking
    && CheckMvInRange( pMe->sDirectionalMv, ksMinMv, ksMaxMv ) ) {
    uint8_t* pRef = &pMe->pColoRefMb[kiMvY * kiRefStride + kiMvX];
    uint32_t uiCurrentSadCost = pSad( pMe->pEncMb, kiEncStride,  pRef, kiRefStride ) +
      COST_MVD(pMe->pMvdCost, (kiMvX<<2) - pMe->sMvp.iMvX, (kiMvY<<2) - pMe->sMvp.iMvY );
    if( uiCurrentSadCost < pMe->uiSadCost ) {
      iBestSadCost = uiCurrentSadCost;
      return true;
    }
  }
  return false;
}

bool CheckDirectionalMvFalse(PSampleSadSatdCostFunc pSad, void * vpMe,
                      const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
                      int32_t& iBestSadCost) {
  return false;
}

/////////////////////////
// Cross Search Basics
/////////////////////////
#if defined (X86_ASM)
void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
{
  uint16_t *pBaseCost  = pMvdCost;
  const int32_t kiOffset = (kiStartMv<<2);
  uint16_t *pMvd  = pMvdTable+kiOffset;
  for (int32_t i = 0; i < 8; ++ i) {
    pBaseCost[i] = ((*pMvd) + kiFixedCost);
    pMvd += 4;
  }
}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                            uint16_t* pMvdTable, const int32_t kiFixedMvd,
                            const int32_t kiEncStride, const int32_t kiRefStride,
                          const int32_t kiMinPos, const int32_t kiMaxPos,
                          const bool bVerticalSearch ) {
  SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
  SWelsME *pMe                        = static_cast<SWelsME *>(vpMe);
  uint8_t*  kpEncMb = pMe->pEncMb;
  const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
  uint8_t* pRef         = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
  const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
  const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
  PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
  PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;

  const int32_t kiDiff   = kiMaxPos - kiMinPos;
  const int32_t kiRowNum  = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
  const int32_t kiBlocksNum  = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
  int32_t iCountLoop8  = (kiRowNum-kiEdgeBlocks) >> 3;
  const int32_t kiRemainingVectors  = kiDiff - (iCountLoop8<<3);
  const int32_t kiMatrixStride  = MAX_VERTICAL_MV_RANGE;
  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );    // transpose matrix result for enc
  assert(kiRowNum <= kiMatrixStride); // make sure effective memory

  TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
  TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
  int32_t iTargetPos   = kiMinPos;
  int16_t iBestPos    = pMe->sMv.iMvX;
  uint32_t uiBestCost   = pMe->uiSadCost;
  uint32_t uiCostMin;
  int32_t iIndexMinPos;
  kpEncMb = &uiMatrixEnc[0][0];
  pRef = &uiMatrixRef[0][0];

  while(iCountLoop8 > 0) {
    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
    uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
    if (uiCostMin < uiBestCost) {
      uiBestCost = uiCostMin;
      iBestPos  = iTargetPos+iIndexMinPos;
    }
    iTargetPos += 8;
    pRef += 8;
    -- iCountLoop8;
  }
  if (kiRemainingVectors > 0) {
    kpEncMb = pMe->pEncMb;
    pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
    while (iTargetPos < kiMaxPos) {
      const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
      uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
      if (uiSadCost < uiBestCost) {
        uiBestCost = uiSadCost;
        iBestPos = iTargetPos;
      }
      pRef += kiRefStride;
      ++iTargetPos;
    }
  }
  if (uiBestCost < pMe->uiSadCost) {
    SMVUnitXY sBestMv;
    sBestMv.iMvX = 0;
    sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
  }
}

void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                                      uint16_t* pMvdTable, const int32_t kiFixedMvd,
                                      const int32_t kiEncStride, const int32_t kiRefStride,
                                      const int32_t kiMinPos, const int32_t kiMaxPos,
                                      const bool bVerticalSearch )
{
  SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
  SWelsME *pMe                        = static_cast<SWelsME *>(vpMe);
  uint8_t *kpEncMb = pMe->pEncMb;
  const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
  uint8_t *pRef         = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
  const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
  const int32_t kiNumVector = kiMaxPos - kiMinPos;
  int32_t iCountLoop8 = kiNumVector >> 3;
  const int32_t kiRemainingLoop8 = kiNumVector & 7;
  int32_t iTargetPos   = kiMinPos;
  int16_t iBestPos    = pMe->sMv.iMvX;
  uint32_t uiBestCost   = pMe->uiSadCost;
  uint32_t uiCostMin;
  int32_t iIndexMinPos;

  while(iCountLoop8 > 0) {
    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
    uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
    if (uiCostMin < uiBestCost) {
      uiBestCost = uiCostMin;
      iBestPos  = iTargetPos+iIndexMinPos;
    }
    iTargetPos += 8;
    pRef += 8;
    -- iCountLoop8;
  }
  if ( kiRemainingLoop8 > 0 ) {
    while (iTargetPos < kiMaxPos) {
      const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
      uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
      if (uiSadCost < uiBestCost) {
        uiBestCost = uiSadCost;
        iBestPos = iTargetPos;
      }
      ++pRef;
      ++iTargetPos;
    }
  }
  if (uiBestCost < pMe->uiSadCost) {
    SMVUnitXY sBestMv;
    sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
    sBestMv.iMvY = 0;
    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
  }
}
#endif
void LineFullSearch_c( void *pFunc, void *vpMe,
             uint16_t* pMvdTable, const int32_t kiFixedMvd,
             const int32_t kiEncStride, const int32_t kiRefStride,
             const int32_t kiMinPos, const int32_t kiMaxPos,
                          const bool bVerticalSearch ) {
  SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
  SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
  const int32_t kiCurMeBlockPix  = bVerticalSearch?pMe->iCurMeBlockPixY:pMe->iCurMeBlockPixX;
  const int32_t kiStride = bVerticalSearch?kiRefStride:1;
  uint8_t* pRef            = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiStride];
  uint16_t* pMvdCost  = &(pMvdTable[kiMinPos<<2]);
  uint32_t uiBestCost    = 0xFFFFFFFF;
  int32_t iBestPos       = 0;

  for ( int32_t iTargetPos = kiMinPos; iTargetPos < kiMaxPos; ++ iTargetPos ) {
    uint8_t* const kpEncMb  = pMe->pEncMb;
    uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + *pMvdCost);
    if (uiSadCost < uiBestCost) {
      uiBestCost  = uiSadCost;
      iBestPos  = iTargetPos;
    }
    pRef += kiStride;
    pMvdCost+=4;
  }

  if (uiBestCost < pMe->uiSadCost) {
    SMVUnitXY sBestMv;
    sBestMv.iMvX = bVerticalSearch?0:(iBestPos - kiCurMeBlockPix);
    sBestMv.iMvY = bVerticalSearch?(iBestPos - kiCurMeBlockPix):0;
    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiStride], pMe );
  }
}

void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SWelsME * pMe,
           const SSlice* pSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
  PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
  PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;

  const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
  const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
  const int32_t iCurMeBlockPixY = pMe->iCurMeBlockPixY;
  const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
  uint16_t* pMvdCostX = pMe->pMvdCost - iCurMeBlockQpelPixX - pMe->sMvp.iMvX;//do the offset here instead of in the search
  uint16_t* pMvdCostY = pMe->pMvdCost - iCurMeBlockQpelPixY - pMe->sMvp.iMvY;//do the offset here instead of in the search

  //vertical search
  pfVerticalFullSearchFunc( pFuncList, pMe,
    pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
    kiEncStride, kiRefStride,
    iCurMeBlockPixY + pSlice->sMvStartMin.iMvY,
    iCurMeBlockPixY + pSlice->sMvStartMax.iMvY, true );

  //horizontal search
  if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
    pfHorizontalFullSearchFunc( pFuncList, pMe,
      pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
      kiEncStride, kiRefStride,
      iCurMeBlockPixX + pSlice->sMvStartMin.iMvX,
      iCurMeBlockPixX + pSlice->sMvStartMax.iMvX,
      false );
  }
}

/////////////////////////
// Feature Search Basics
/////////////////////////
//memory related
int32_t RequestFeatureSearchPreparation( CMemoryAlign *pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage,
                                        SFeatureSearchPreparation* pFeatureSearchPreparation) {
  const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage>>16;
  const bool bFme8x8 = ((iNeedFeatureStorage & 0x0000FF & ME_FME)==ME_FME);
  const int32_t kiMarginSize = bFme8x8?8:16;
  const int32_t kiFrameSize = (kiFrameWidth-kiMarginSize) * (kiFrameHeight-kiMarginSize);
  int32_t iListOfFeatureOfBlock;

  if (0==kiFeatureStrategyIndex) {
    iListOfFeatureOfBlock =sizeof(uint16_t) * kiFrameSize;
  } else {
    iListOfFeatureOfBlock = sizeof(uint16_t) * kiFrameSize +
      (kiFrameWidth-kiMarginSize) * sizeof(uint32_t) + kiFrameWidth * 8 * sizeof(uint8_t);
  }
  pFeatureSearchPreparation->pFeatureOfBlock =
    (uint16_t *)pMa->WelsMalloc(iListOfFeatureOfBlock, "pFeatureOfBlock");
  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == (pFeatureSearchPreparation->pFeatureOfBlock) )

  pFeatureSearchPreparation->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
  pFeatureSearchPreparation->bFMESwitchFlag = true;
  pFeatureSearchPreparation->uiFMEGoodFrameCount = FMESWITCH_DEFAULT_GOODFRAME_NUM;
  pFeatureSearchPreparation->iHighFreMbCount = 0;

  return ENC_RETURN_SUCCESS;
}
int32_t ReleaseFeatureSearchPreparation( CMemoryAlign *pMa, uint16_t*& pFeatureOfBlock) {
  if ( pMa && pFeatureOfBlock ) {
    pMa->WelsFree( pFeatureOfBlock, "pFeatureOfBlock");
    pFeatureOfBlock=NULL;
    return ENC_RETURN_SUCCESS;
  }
  return ENC_RETURN_UNEXPECTED;
}

int32_t RequestScreenBlockFeatureStorage( CMemoryAlign *pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage,
                                         SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {

  const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage>>16;
  const int32_t kiMe8x8FME = iNeedFeatureStorage & 0x0000FF & ME_FME;
  const int32_t kiMe16x16FME = ((iNeedFeatureStorage & 0x00FF00)>>8) & ME_FME;
  if ((kiMe8x8FME==ME_FME) && (kiMe16x16FME==ME_FME)) {
    return ENC_RETURN_UNSUPPORTED_PARA;
    //the following memory allocation cannot support when FME at both size
  }

  const bool bIsBlock8x8 = (kiMe8x8FME==ME_FME);
  const int32_t kiMarginSize = bIsBlock8x8?8:16;
  const int32_t kiFrameSize = (kiFrameWidth-kiMarginSize) * (kiFrameHeight-kiMarginSize);
  const int32_t kiListSize  = (0==kiFeatureStrategyIndex)?(bIsBlock8x8 ? LIST_SIZE_SUM_8x8 : LIST_SIZE_SUM_16x16):256;

  pScreenBlockFeatureStorage->pTimesOfFeatureValue = (uint32_t*)pMa->WelsMalloc(kiListSize*sizeof(uint32_t),"pScreenBlockFeatureStorage->pTimesOfFeatureValue");
  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pTimesOfFeatureValue)

  pScreenBlockFeatureStorage->pLocationOfFeature = (uint16_t**)pMa->WelsMalloc(kiListSize*sizeof(uint16_t*),"pScreenBlockFeatureStorage->pLocationOfFeature");
  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationOfFeature)

  pScreenBlockFeatureStorage->pLocationPointer = (uint16_t*)pMa->WelsMalloc(2*kiFrameSize*sizeof(uint16_t), "pScreenBlockFeatureStorage->pLocationPointer");
  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationPointer)

  pScreenBlockFeatureStorage->pFeatureOfBlockPointer = NULL;
  pScreenBlockFeatureStorage->iIs16x16 = !bIsBlock8x8;
  pScreenBlockFeatureStorage->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
  pScreenBlockFeatureStorage->iActualListSize = kiListSize;
  memset(pScreenBlockFeatureStorage->uiSadCostThreshold, UINT_MAX, BLOCK_SIZE_ALL*sizeof(uint32_t));
  pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = false;

  return ENC_RETURN_SUCCESS;
}
int32_t ReleaseScreenBlockFeatureStorage( CMemoryAlign *pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage ) {
  if ( pMa && pScreenBlockFeatureStorage ) {
    pMa->WelsFree( pScreenBlockFeatureStorage->pTimesOfFeatureValue, "pScreenBlockFeatureStorage->pTimesOfFeatureValue");
    pScreenBlockFeatureStorage->pTimesOfFeatureValue=NULL;

    pMa->WelsFree( pScreenBlockFeatureStorage->pLocationOfFeature, "pScreenBlockFeatureStorage->pLocationOfFeature");
    pScreenBlockFeatureStorage->pLocationOfFeature=NULL;

    pMa->WelsFree( pScreenBlockFeatureStorage->pLocationPointer, "pScreenBlockFeatureStorage->pLocationPointer");
    pScreenBlockFeatureStorage->pLocationPointer=NULL;

    return ENC_RETURN_SUCCESS;
  }
  return ENC_RETURN_UNEXPECTED;
}

//preprocess related
int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iSum = 0, i;
  for(i = 0; i < 8; i++) {
    iSum +=  pRef[0]    + pRef[1]  + pRef[2]  + pRef[3];
    iSum +=  pRef[4]    + pRef[5]  + pRef[6]  + pRef[7];
    pRef += kiRefStride;
  }
  return iSum;
}
int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iSum = 0, i;
  for(i = 0; i < 16; i++) {
    iSum +=  pRef[0]    + pRef[1]  + pRef[2]  + pRef[3];
    iSum +=  pRef[4]    + pRef[5]  + pRef[6]  + pRef[7];
    iSum    +=  pRef[8]    + pRef[9]  + pRef[10]  + pRef[11];
    iSum    +=  pRef[12]  + pRef[13]  + pRef[14]  + pRef[15];
    pRef += kiRefStride;
  }
  return iSum;
}

void SumOf8x8BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
{
  int32_t x, y;
  uint8_t *pRef;
  uint16_t *pBuffer;
  int32_t iSum;
  for(y = 0; y < kiHeight; y++) {
    pRef = pRefPicture  + kiRefStride * y;
    pBuffer  = pFeatureOfBlock + kiWidth * y;
    for(x = 0; x < kiWidth; x++) {
      iSum = SumOf8x8SingleBlock_c(pRef + x, kiRefStride);

      pBuffer[x] = iSum;
      pTimesOfFeatureValue[iSum]++;
    }
  }
}

void SumOf16x16BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
{//TODO: this is similar to SumOf8x8BlockOfFrame_c expect the calling of single block func, refactor-able?
  int32_t x, y;
  uint8_t *pRef;
  uint16_t *pBuffer;
  int32_t iSum;
  for(y = 0; y < kiHeight; y++) {
    pRef = pRefPicture  + kiRefStride * y;
    pBuffer  = pFeatureOfBlock + kiWidth * y;
    for(x = 0; x < kiWidth; x++) {
      iSum = SumOf16x16SingleBlock_c(pRef + x, kiRefStride);

      pBuffer[x] = iSum;
      pTimesOfFeatureValue[iSum]++;
    }
  }
}

void InitializeHashforFeature_c( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
                                uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList ) {
  //assign location pointer
  uint16_t *pBufPos  = pBuf;
  for( int32_t i = 0 ; i < kiListSize; ++i ) {
    pLocationOfFeature[i] =
      pFeatureValuePointerList[i] = pBufPos;
    pBufPos      += (pTimesOfFeatureValue[i]<<1);
  }
}
void FillQpelLocationByFeatureValue_c( uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
                                       uint16_t** pFeatureValuePointerList ) {
  //assign each pixel's position
  uint16_t* pSrcPointer  =  pFeatureOfBlock;
  int32_t iQpelY = 0;
  for(int32_t y = 0; y < kiHeight; y++) {
    for(int32_t x = 0; x < kiWidth; x++) {
      uint16_t uiFeature = pSrcPointer[x];
      ST32( &pFeatureValuePointerList[uiFeature][0], ((iQpelY<<16)|(x<<2)) );
      pFeatureValuePointerList[uiFeature] += 2;
    }
    iQpelY += 4;
    pSrcPointer += kiWidth;
  }
}

void CalculateFeatureOfBlock( SWelsFuncPtrList *pFunc, SPicture* pRef,
                         SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
  uint16_t* pFeatureOfBlock = pScreenBlockFeatureStorage->pFeatureOfBlockPointer;
  uint32_t* pTimesOfFeatureValue = pScreenBlockFeatureStorage->pTimesOfFeatureValue;
  uint16_t** pLocationOfFeature  = pScreenBlockFeatureStorage->pLocationOfFeature;
  uint16_t* pBuf = pScreenBlockFeatureStorage->pLocationPointer;

  uint8_t* pRefData = pRef->pData[0];
  const int32_t iRefStride = pRef->iLineSize[0];
  int32_t iIs16x16 = pScreenBlockFeatureStorage->iIs16x16;
  const int32_t iEdgeDiscard = (iIs16x16?16:8);//this is to save complexity of padding on pRef
  const int32_t iWidth = pRef->iWidthInPixel - iEdgeDiscard;
  const int32_t kiHeight = pRef->iHeightInPixel - iEdgeDiscard;
  const int32_t kiActualListSize = pScreenBlockFeatureStorage->iActualListSize;
  uint16_t* pFeatureValuePointerList[WELS_MAX(LIST_SIZE_SUM_16x16,LIST_SIZE_MSE_16x16)] = {0};

  memset(pTimesOfFeatureValue, 0, sizeof(int32_t)*kiActualListSize);
  (pFunc->pfCalculateBlockFeatureOfFrame[iIs16x16])(pRefData,iWidth, kiHeight, iRefStride, pFeatureOfBlock, pTimesOfFeatureValue);

  //assign pLocationOfFeature pointer
  InitializeHashforFeature_c( pTimesOfFeatureValue, pBuf, kiActualListSize,
    pLocationOfFeature, pFeatureValuePointerList );

  //assign each pixel's pLocationOfFeature
  FillQpelLocationByFeatureValue_c(pFeatureOfBlock, iWidth, kiHeight, pFeatureValuePointerList);
}

void PerformFMEPreprocess( SWelsFuncPtrList *pFunc, SPicture* pRef,
                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
    CalculateFeatureOfBlock(pFunc, pRef, pScreenBlockFeatureStorage );
    pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = true;

    uint32_t uiRefPictureAvgQstepx16 = QStepx16ByQp[WelsMedian(0, pRef->iFrameAverageQp, 51)];
    uint32_t uiSadCostThreshold16x16 = ((30 * (uiRefPictureAvgQstepx16 + 160))>>3);
    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x16] = uiSadCostThreshold16x16;
    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x8] = (uiSadCostThreshold16x16>>2);
    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x8]
    = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x16]
    = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_4x4] = UINT_MAX;
}

//search related
void SetFeatureSearchIn( SWelsFuncPtrList *pFunc,  const SWelsME& sMe,
                        const SSlice *pSlice, SScreenBlockFeatureStorage* pRefFeatureStorage,
                        const int32_t kiEncStride, const int32_t kiRefStride,
                        SFeatureSearchIn* pFeatureSearchIn ) {
  pFeatureSearchIn->pSad = pFunc->sSampleDealingFuncs.pfSampleSad[sMe.uiBlockSize];
  pFeatureSearchIn->iFeatureOfCurrent=pFunc->pfCalculateSingleBlockFeature[BLOCK_16x16==sMe.uiBlockSize](sMe.pEncMb, kiEncStride);

  pFeatureSearchIn->pEnc       = sMe.pEncMb;
  pFeatureSearchIn->pColoRef = sMe.pColoRefMb;
  pFeatureSearchIn->iEncStride = kiEncStride;
  pFeatureSearchIn->iRefStride = kiRefStride;
  pFeatureSearchIn->uiSadCostThresh = sMe.uiSadCostThreshold;

  pFeatureSearchIn->iCurPixX = sMe.iCurMeBlockPixX;
  pFeatureSearchIn->iCurPixXQpel = (pFeatureSearchIn->iCurPixX<<2);
  pFeatureSearchIn->iCurPixY = sMe.iCurMeBlockPixY;
  pFeatureSearchIn->iCurPixYQpel = (pFeatureSearchIn->iCurPixY<<2);

  pFeatureSearchIn->pTimesOfFeature = pRefFeatureStorage->pTimesOfFeatureValue;
  pFeatureSearchIn->pQpelLocationOfFeature = pRefFeatureStorage->pLocationOfFeature;
  pFeatureSearchIn->pMvdCostX = sMe.pMvdCost - pFeatureSearchIn->iCurPixXQpel - sMe.sMvp.iMvX;
  pFeatureSearchIn->pMvdCostY = sMe.pMvdCost - pFeatureSearchIn->iCurPixYQpel - sMe.sMvp.iMvY;

  pFeatureSearchIn->iMinQpelX = pFeatureSearchIn->iCurPixXQpel+((pSlice->sMvStartMin.iMvX)<<2);
  pFeatureSearchIn->iMinQpelY = pFeatureSearchIn->iCurPixYQpel+((pSlice->sMvStartMin.iMvY)<<2);
  pFeatureSearchIn->iMaxQpelX = pFeatureSearchIn->iCurPixXQpel+((pSlice->sMvStartMax.iMvX)<<2);
  pFeatureSearchIn->iMaxQpelY = pFeatureSearchIn->iCurPixYQpel+((pSlice->sMvStartMax.iMvY)<<2);
}
void SaveFeatureSearchOut( const SMVUnitXY sBestMv, const uint32_t uiBestSadCost, uint8_t* pRef, SFeatureSearchOut* pFeatureSearchOut) {
  pFeatureSearchOut->sBestMv = sBestMv;
  pFeatureSearchOut->uiBestSadCost = uiBestSadCost;
  pFeatureSearchOut->pBestRef = pRef;
}

bool FeatureSearchOne( SFeatureSearchIn &sFeatureSearchIn, const int32_t iFeatureDifference, const uint32_t kuiExpectedSearchTimes,
                      SFeatureSearchOut* pFeatureSearchOut) {
  const int32_t iFeatureOfRef = (sFeatureSearchIn.iFeatureOfCurrent + iFeatureDifference);
  if(iFeatureOfRef < 0 || iFeatureOfRef >= LIST_SIZE)
    return true;

  PSampleSadSatdCostFunc pSad = sFeatureSearchIn.pSad;
  uint8_t* pEnc =  sFeatureSearchIn.pEnc;
  uint8_t* pColoRef = sFeatureSearchIn.pColoRef;
  const int32_t iEncStride=  sFeatureSearchIn.iEncStride;
  const int32_t iRefStride =  sFeatureSearchIn.iRefStride;
  const uint16_t uiSadCostThresh = sFeatureSearchIn.uiSadCostThresh;

  const int32_t iCurPixX = sFeatureSearchIn.iCurPixX;
  const int32_t iCurPixY = sFeatureSearchIn.iCurPixY;
  const int32_t iCurPixXQpel = sFeatureSearchIn.iCurPixXQpel;
  const int32_t iCurPixYQpel = sFeatureSearchIn.iCurPixYQpel;

  const int32_t iMinQpelX =  sFeatureSearchIn.iMinQpelX;
  const int32_t iMinQpelY =  sFeatureSearchIn.iMinQpelY;
  const int32_t iMaxQpelX =  sFeatureSearchIn.iMaxQpelX;
  const int32_t iMaxQpelY =  sFeatureSearchIn.iMaxQpelY;

  const int32_t iSearchTimes = WELS_MIN(sFeatureSearchIn.pTimesOfFeature[iFeatureOfRef], kuiExpectedSearchTimes);
  const int32_t iSearchTimesx2 = (iSearchTimes<<1);
  const uint16_t* pQpelPosition = sFeatureSearchIn.pQpelLocationOfFeature[iFeatureOfRef];

  SMVUnitXY sBestMv;
  uint32_t uiBestCost, uiTmpCost;
  uint8_t *pBestRef, *pCurRef;
  int32_t iQpelX, iQpelY;
  int32_t iIntepelX, iIntepelY;
  int32_t i;

  sBestMv.iMvX = pFeatureSearchOut->sBestMv.iMvX;
  sBestMv.iMvY = pFeatureSearchOut->sBestMv.iMvY;
  uiBestCost = pFeatureSearchOut->uiBestSadCost;
  pBestRef = pFeatureSearchOut->pBestRef;

  for( i = 0; i < iSearchTimesx2; i+=2) {
    iQpelX = pQpelPosition[i];
    iQpelY = pQpelPosition[i+1];

    if((iQpelX > iMaxQpelX) || (iQpelX < iMinQpelX)
      || (iQpelY > iMaxQpelY) || (iQpelY < iMinQpelY)
      || (iQpelX == iCurPixXQpel) || (iQpelY == iCurPixYQpel) )
      continue;

    uiTmpCost = sFeatureSearchIn.pMvdCostX[ iQpelX ] + sFeatureSearchIn.pMvdCostY[ iQpelY ];
    if(uiTmpCost + iFeatureDifference >= uiBestCost)
      continue;

    iIntepelX = (iQpelX>>2) - iCurPixX;
    iIntepelY = (iQpelY>>2) - iCurPixY;
    pCurRef = &pColoRef[iIntepelX + iIntepelY * iRefStride];
    uiTmpCost += pSad( pEnc, iEncStride, pCurRef, iRefStride );
    if( uiTmpCost < uiBestCost ) {
      sBestMv.iMvX = iIntepelX;
      sBestMv.iMvY = iIntepelY;
      uiBestCost = uiTmpCost;
      pBestRef = pCurRef;

      if(uiBestCost < uiSadCostThresh)
        break;
    }
  }
  SaveFeatureSearchOut(sBestMv, uiBestCost, pBestRef, pFeatureSearchOut);
  return (i < iSearchTimesx2);
}


void MotionEstimateFeatureFullSearch( SFeatureSearchIn &sFeatureSearchIn,
                                        const uint32_t kuiMaxSearchPoint,
                                        SWelsME* pMe) {
  SFeatureSearchOut sFeatureSearchOut = { { 0 } };//TODO: this can be refactored and removed
  sFeatureSearchOut.uiBestSadCost = pMe->uiSadCost;
  sFeatureSearchOut.sBestMv = pMe->sMv;
  sFeatureSearchOut.pBestRef = pMe->pRefMb;

  int32_t iFeatureDifference = 0;//TODO: change it according to computational-complexity setting when needed
  FeatureSearchOne( sFeatureSearchIn, iFeatureDifference, kuiMaxSearchPoint, &sFeatureSearchOut );
  if ( sFeatureSearchOut.uiBestSadCost < pMe->uiSadCost ) {//TODO: this may be refactored and removed
    UpdateMeResults(sFeatureSearchOut.sBestMv,
      sFeatureSearchOut.uiBestSadCost, sFeatureSearchOut.pBestRef,
      pMe);
  }
}

//switch related
static uint32_t CountFMECostDown( const SDqLayer* pCurLayer ) {
  uint32_t uiCostDownSum      = 0;
  const int32_t kiSliceCount  = GetCurrentSliceNum( pCurLayer->pSliceEncCtx );
  if ( kiSliceCount >= 1 ) {
    int32_t iSliceIndex  = 0;
    SSlice *pSlice    = &pCurLayer->sLayerInfo.pSliceInLayer[iSliceIndex];
    while( iSliceIndex < kiSliceCount ) {
      uiCostDownSum += pSlice->uiSliceFMECostDown;
      ++ pSlice;
      ++ iSliceIndex;
    }
  }
  return uiCostDownSum;
}
#define FMESWITCH_MBAVERCOSTSAVING_THRESHOLD (2) //empirically set.
#define FMESWITCH_GOODFRAMECOUNT_MAX (5) //empirically set.
static void UpdateFMEGoodFrameCount(const uint32_t iAvMBNormalizedRDcostDown, uint8_t& uiFMEGoodFrameCount) {
  //this strategy may be changed, here the number is derived from empirical-numbers
  // uiFMEGoodFrameCount lies in [0,FMESWITCH_GOODFRAMECOUNT_MAX]
  if ( iAvMBNormalizedRDcostDown > FMESWITCH_MBAVERCOSTSAVING_THRESHOLD ) {
    if ( uiFMEGoodFrameCount < FMESWITCH_GOODFRAMECOUNT_MAX )
      ++ uiFMEGoodFrameCount;
  } else {
    if ( uiFMEGoodFrameCount > 0 )
      -- uiFMEGoodFrameCount;
  }
}
void UpdateFMESwitch(SDqLayer* pCurLayer) {
  const uint32_t iFMECost = CountFMECostDown( pCurLayer );
  const uint32_t iAvMBNormalizedRDcostDown  = iFMECost / (pCurLayer->iMbWidth*pCurLayer->iMbHeight);
  UpdateFMEGoodFrameCount( iAvMBNormalizedRDcostDown, pCurLayer->pFeatureSearchPreparation->uiFMEGoodFrameCount );
}
void UpdateFMESwitchNull(SDqLayer* pCurLayer) {
}
/////////////////////////
// Search function options
/////////////////////////
void WelsDiamondCrossSearch(SWelsFuncPtrList *pFunc, void* vpMe, void* vpSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
    SWelsME* pMe       = static_cast<SWelsME *>(vpMe);
    SSlice* pSlice         = static_cast<SSlice *>(vpSlice);

    //  Step 1: diamond search
    WelsDiamondSearch(pFunc, vpMe, vpSlice, kiEncStride, kiRefStride);

    //  Step 2: CROSS search
    SScreenBlockFeatureStorage pRefBlockFeature; //TODO: use this structure from Ref
    pMe->uiSadCostThreshold = pRefBlockFeature.uiSadCostThreshold[pMe->uiBlockSize];
    if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
      WelsMotionCrossSearch(pFunc, pMe, pSlice, kiEncStride, kiRefStride);
    }
}
void WelsDiamondCrossFeatureSearch(SWelsFuncPtrList *pFunc, void* vpMe, void* vpSlice, const int32_t kiEncStride, const int32_t kiRefStride) {
    SWelsME* pMe       = static_cast<SWelsME *>(vpMe);
    SSlice* pSlice         = static_cast<SSlice *>(vpSlice);

    //  Step 1: diamond search + cross
    WelsDiamondCrossSearch(pFunc, pMe, pSlice, kiEncStride, kiRefStride);

    // Step 2: FeatureSearch
    if (pMe->uiSadCost >= pMe->uiSadCostThreshold) {
        pSlice->uiSliceFMECostDown += pMe->uiSadCost;

        SScreenBlockFeatureStorage tmpScreenBlockFeatureStorage; //TODO: use this structure from Ref
        uint32_t uiMaxSearchPoint = INT_MAX;//TODO: change it according to computational-complexity setting
        SFeatureSearchIn sFeatureSearchIn = {0};
        SetFeatureSearchIn(pFunc, *pMe, pSlice, &tmpScreenBlockFeatureStorage,
          kiEncStride, kiRefStride,
          &sFeatureSearchIn);
        MotionEstimateFeatureFullSearch( sFeatureSearchIn, uiMaxSearchPoint, pMe);

        pSlice->uiSliceFMECostDown -= pMe->uiSadCost;
    }
}

} // namespace WelsSVCEnc