ref: e6a6b09dba1a008fbb17b01bbb9d48e74deb29cb
dir: /codec/encoder/core/src/svc_motion_estimate.cpp/
/*! * \copy * Copyright (c) 2009-2013, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * * \file svc motion estimate.c * * \brief Interfaces introduced in svc mb motion estimation * * \date 08/11/2009 Created * ************************************************************************************* */ #include "cpu_core.h" #include "ls_defines.h" #include "svc_motion_estimate.h" #include "wels_transpose_matrix.h" namespace WelsEnc { const int32_t QStepx16ByQp[52] = { /* save QStep<<4 for int32_t */ 10, 11, 13, 14, 16, 18, /* 0~5 */ 20, 22, 26, 28, 32, 36, /* 6~11 */ 40, 44, 52, 56, 64, 72, /* 12~17 */ 80, 88, 104, 112, 128, 144, /* 18~23 */ 160, 176, 208, 224, 256, 288, /* 24~29 */ 320, 352, 416, 448, 512, 576, /* 30~35 */ 640, 704, 832, 896, 1024, 1152, /* 36~41 */ 1280, 1408, 1664, 1792, 2048, 2304, /* 42~47 */ 2560, 2816, 3328, 3584 /* 48~51 */ }; static inline void UpdateMeResults (const SMVUnitXY ksBestMv, const uint32_t kiBestSadCost, uint8_t* pRef, SWelsME* pMe) { pMe->sMv = ksBestMv; pMe->pRefMb = pRef; pMe->uiSadCost = kiBestSadCost; } static inline void MeEndIntepelSearch (SWelsME* pMe) { /* -> qpel mv */ pMe->sMv.iMvX *= (1 << 2); pMe->sMv.iMvY *= (1 << 2); pMe->uiSatdCost = pMe->uiSadCost; } void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScreenContent) { pFuncList->pfUpdateFMESwitch = UpdateFMESwitchNull; if (!bScreenContent) { pFuncList->pfCheckDirectionalMv = CheckDirectionalMvFalse; pFuncList->pfCalculateBlockFeatureOfFrame[0] = pFuncList->pfCalculateBlockFeatureOfFrame[1] = NULL; pFuncList->pfCalculateSingleBlockFeature[0] = pFuncList->pfCalculateSingleBlockFeature[1] = NULL; } else { pFuncList->pfCheckDirectionalMv = CheckDirectionalMv; //for cross serarch pFuncList->pfVerticalFullSearch = LineFullSearch_c; pFuncList->pfHorizontalFullSearch = LineFullSearch_c; #if defined (X86_ASM) if (uiCpuFlag & WELS_CPU_SSE41) { pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41; pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41; pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41; pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41; } #endif //for feature search pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_c; pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_c; pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_c; pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_c; //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8? pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c; pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c; #if defined (X86_ASM) if (uiCpuFlag & WELS_CPU_SSE2) { //for feature search pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_sse2; pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_sse2; pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse2; pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse2; //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8? pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_sse2; pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_sse2; } if (uiCpuFlag & WELS_CPU_SSE41) { //for feature search pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_sse4; pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_sse4; } #endif #if defined (HAVE_NEON) if (uiCpuFlag & WELS_CPU_NEON) { //for feature search pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_neon; pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_neon; pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon; pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon; //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8? pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon; pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon; } #endif #if defined (HAVE_NEON_AARCH64) if (uiCpuFlag & WELS_CPU_NEON) { //for feature search pFuncList->pfInitializeHashforFeature = InitializeHashforFeature_AArch64_neon; pFuncList->pfFillQpelLocationByFeatureValue = FillQpelLocationByFeatureValue_AArch64_neon; pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon; pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon; //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8? pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon; pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon; } #endif } } /*! * \brief BL mb motion estimate search * * \param enc Wels encoder context * \param pMe Wels me information * * \return NONE */ void WelsMotionEstimateSearch (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe, SSlice* pSlice) { const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0]; const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0]; // Step 1: Initial point prediction if (!WelsMotionEstimateInitialPoint (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef)) { pFuncList->pfSearchMethod[pMe->uiBlockSize] (pFuncList, pMe, pSlice, kiStrideEnc, kiStrideRef); MeEndIntepelSearch (pMe); } pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc, kiStrideRef); } void WelsMotionEstimateSearchStatic (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe, SSlice* pLpslice) { const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0]; const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0]; pMe->sMv.iMvX = pMe->sMv.iMvY = 0; pMe->uiSadCost = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize] (pMe->pEncMb, kiStrideEnc, pMe->pRefMb, kiStrideRef) ; pMe->uiSadCost += COST_MVD (pMe->pMvdCost, - pMe->sMvp.iMvX, - pMe->sMvp.iMvY); MeEndIntepelSearch (pMe); pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc, kiStrideRef); } void WelsMotionEstimateSearchScrolled (SWelsFuncPtrList* pFuncList, SDqLayer* pCurDqLayer, SWelsME* pMe, SSlice* pSlice) { const int32_t kiStrideEnc = pCurDqLayer->iEncStride[0]; const int32_t kiStrideRef = pCurDqLayer->pRefPic->iLineSize[0]; pMe->sMv = pMe->sDirectionalMv; pMe->pRefMb = pMe->pColoRefMb + pMe->sMv.iMvY * kiStrideRef + pMe->sMv.iMvX; pMe->uiSadCost = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize] (pMe->pEncMb, kiStrideEnc, pMe->pRefMb, kiStrideRef) + COST_MVD (pMe->pMvdCost, (pMe->sMv.iMvX * (1 << 2)) - pMe->sMvp.iMvX, (pMe->sMv.iMvY * (1 << 2)) - pMe->sMvp.iMvY); MeEndIntepelSearch (pMe); pFuncList->pfCalculateSatd (pFuncList->sSampleDealingFuncs.pfSampleSatd[pMe->uiBlockSize], pMe, kiStrideEnc, kiStrideRef); } /*! * \brief EL mb motion estimate initial point testing * * \param pix_pFuncList SSampleDealingFunc * \param pMe Wels me information * \param mv_range search range in motion estimate * \param point the best match point in motion estimation * * \return NONE */ bool WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, int32_t iStrideEnc, int32_t iStrideRef) { PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize]; const uint16_t* kpMvdCost = pMe->pMvdCost; uint8_t* const kpEncMb = pMe->pEncMb; int16_t iMvc0, iMvc1; int32_t iSadCost; int32_t iBestSadCost; uint8_t* pRefMb; uint8_t* pFref2; uint32_t i; const uint32_t kuiMvcNum = pSlice->uiMvcNum; const SMVUnitXY* kpMvcList = &pSlice->sMvc[0]; const SMVUnitXY ksMvStartMin = pSlice->sMvStartMin; const SMVUnitXY ksMvStartMax = pSlice->sMvStartMax; const SMVUnitXY ksMvp = pMe->sMvp; SMVUnitXY sMv; // Step 1: Initial point prediction // init with sMvp sMv.iMvX = WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX); sMv.iMvY = WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY); pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX]; iBestSadCost = pSad (kpEncMb, iStrideEnc, pRefMb, iStrideRef); iBestSadCost += COST_MVD (kpMvdCost, ((sMv.iMvX) * (1 << 2)) - ksMvp.iMvX, ((sMv.iMvY) * (1 << 2)) - ksMvp.iMvY); for (i = 0; i < kuiMvcNum; i++) { //clipping here is essential since some pOut-of-range MVC may happen here (i.e., refer to baseMV) iMvc0 = WELS_CLIP3 ((2 + kpMvcList[i].iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX); iMvc1 = WELS_CLIP3 ((2 + kpMvcList[i].iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY); if (((iMvc0 - sMv.iMvX) || (iMvc1 - sMv.iMvY))) { pFref2 = &pMe->pRefMb[iMvc1 * iStrideRef + iMvc0]; iSadCost = pSad (kpEncMb, iStrideEnc, pFref2, iStrideRef) + COST_MVD (kpMvdCost, (iMvc0 * (1 << 2)) - ksMvp.iMvX, (iMvc1 * (1 << 2)) - ksMvp.iMvY); if (iSadCost < iBestSadCost) { sMv.iMvX = iMvc0; sMv.iMvY = iMvc1; pRefMb = pFref2; iBestSadCost = iSadCost; } } } if (pFuncList->pfCheckDirectionalMv (pSad, pMe, ksMvStartMin, ksMvStartMax, iStrideEnc, iStrideRef, iSadCost)) { sMv = pMe->sDirectionalMv; pRefMb = &pMe->pColoRefMb[sMv.iMvY * iStrideRef + sMv.iMvX]; iBestSadCost = iSadCost; } UpdateMeResults (sMv, iBestSadCost, pRefMb, pMe); if (iBestSadCost < static_cast<int32_t> (pMe->uSadPredISatd.uiSadPred)) { //Initial point early Stop MeEndIntepelSearch (pMe); return true; } return false; } void CalculateSatdCost (PSampleSadSatdCostFunc pSatd, SWelsME* pMe, const int32_t kiEncStride, const int32_t kiRefStride) { pMe->uSadPredISatd.uiSatd = pSatd (pMe->pEncMb, kiEncStride, pMe->pRefMb, kiRefStride); pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX, pMe->sMv.iMvY - pMe->sMvp.iMvY); } void NotCalculateSatdCost (PSampleSadSatdCostFunc pSatd, SWelsME* pMe, const int32_t kiEncStride, const int32_t kiRefStride) { } ///////////////////////// // Diamond Search Basics ///////////////////////// bool WelsMeSadCostSelect (int32_t* iSadCost, const uint16_t* kpMvdCost, int32_t* pBestCost, const int32_t kiDx, const int32_t kiDy, int32_t* pIx, int32_t* pIy) { int32_t iTempSadCost[4]; int32_t iInputSadCost = *pBestCost; iTempSadCost[0] = iSadCost[0] + COST_MVD (kpMvdCost, kiDx, kiDy - 4); iTempSadCost[1] = iSadCost[1] + COST_MVD (kpMvdCost, kiDx, kiDy + 4); iTempSadCost[2] = iSadCost[2] + COST_MVD (kpMvdCost, kiDx - 4, kiDy); iTempSadCost[3] = iSadCost[3] + COST_MVD (kpMvdCost, kiDx + 4, kiDy); if (iTempSadCost[0] < *pBestCost) { *pBestCost = iTempSadCost[0]; *pIx = 0; *pIy = 1; } if (iTempSadCost[1] < *pBestCost) { *pBestCost = iTempSadCost[1]; *pIx = 0; *pIy = -1; } if (iTempSadCost[2] < *pBestCost) { *pBestCost = iTempSadCost[2]; *pIx = 1; *pIy = 0; } if (iTempSadCost[3] < *pBestCost) { *pBestCost = iTempSadCost[3]; *pIx = -1; *pIy = 0; } return (*pBestCost == iInputSadCost); } void WelsDiamondSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, const int32_t kiStrideEnc, const int32_t kiStrideRef) { PSample4SadCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiBlockSize]; uint8_t* pFref = pMe->pRefMb; uint8_t* const kpEncMb = pMe->pEncMb; const uint16_t* kpMvdCost = pMe->pMvdCost; const SMVUnitXY ksMvStartMin = pSlice->sMvStartMin; const SMVUnitXY ksMvStartMax = pSlice->sMvStartMax; int32_t iMvDx = ((pMe->sMv.iMvX) * (1 << 2)) - pMe->sMvp.iMvX; int32_t iMvDy = ((pMe->sMv.iMvY) * (1 << 2)) - pMe->sMvp.iMvY; uint8_t* pRefMb = pFref; int32_t iBestCost = (pMe->uiSadCost); int32_t iTimeThreshold = ITERATIVE_TIMES; ENFORCE_STACK_ALIGN_1D (int32_t, iSadCosts, 4, 16) while (iTimeThreshold--) { pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) >> 2; pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) >> 2; if (!CheckMvInRange (pMe->sMv, ksMvStartMin, ksMvStartMax)) continue; pSad (kpEncMb, kiStrideEnc, pRefMb, kiStrideRef, &iSadCosts[0]); int32_t iX, iY; const bool kbIsBestCostWorse = WelsMeSadCostSelect (iSadCosts, kpMvdCost, &iBestCost, iMvDx, iMvDy, &iX, &iY); if (kbIsBestCostWorse) break; iMvDx -= (iX * (1 << 2)) ; iMvDy -= (iY * (1 << 2)) ; pRefMb -= (iX + iY * kiStrideRef); } /* integer-pel mv */ pMe->sMv.iMvX = (iMvDx + pMe->sMvp.iMvX) >> 2; pMe->sMv.iMvY = (iMvDy + pMe->sMvp.iMvY) >> 2; pMe->uiSatdCost = pMe->uiSadCost = (iBestCost); pMe->pRefMb = pRefMb; } ///////////////////////// // DirectionalMv Basics ///////////////////////// bool CheckDirectionalMv (PSampleSadSatdCostFunc pSad, SWelsME* pMe, const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride, int32_t& iBestSadCost) { const int16_t kiMvX = pMe->sDirectionalMv.iMvX; const int16_t kiMvY = pMe->sDirectionalMv.iMvY; //Check MV from scrolling detection if ((BLOCK_16x16 != pMe->uiBlockSize) //scrolled_MV with P16x16 is checked SKIP checking function && (kiMvX | kiMvY) //(0,0) checked in ordinary initial point checking && CheckMvInRange (pMe->sDirectionalMv, ksMinMv, ksMaxMv)) { uint8_t* pRef = &pMe->pColoRefMb[kiMvY * kiRefStride + kiMvX]; uint32_t uiCurrentSadCost = pSad (pMe->pEncMb, kiEncStride, pRef, kiRefStride) + COST_MVD (pMe->pMvdCost, (kiMvX * (1 << 2)) - pMe->sMvp.iMvX, (kiMvY * (1 << 2)) - pMe->sMvp.iMvY); if (uiCurrentSadCost < pMe->uiSadCost) { iBestSadCost = uiCurrentSadCost; return true; } } return false; } bool CheckDirectionalMvFalse (PSampleSadSatdCostFunc pSad, SWelsME* vpMe, const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride, int32_t& iBestSadCost) { return false; } ///////////////////////// // Cross Search Basics ///////////////////////// #if defined (X86_ASM) void CalcMvdCostx8_c (uint16_t* pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost) { uint16_t* pBaseCost = pMvdCost; const int32_t kiOffset = (kiStartMv * (1 << 2)); uint16_t* pMvd = pMvdTable + kiOffset; for (int32_t i = 0; i < 8; ++ i) { pBaseCost[i] = ((*pMvd) + kiFixedCost); pMvd += 4; } } void VerticalFullSearchUsingSSE41 (SWelsFuncPtrList* pFuncList, SWelsME* pMe, uint16_t* pMvdTable, const int32_t kiEncStride, const int32_t kiRefStride, const int16_t kiMinMv, const int16_t kiMaxMv, const bool bVerticalSearch) { uint8_t* kpEncMb = pMe->pEncMb; const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY; uint8_t* pRef = &pMe->pColoRefMb[kiMinMv * kiRefStride]; const int32_t kiCurMeBlockPixY = pMe->iCurMeBlockPixY; int32_t iMinPos = kiCurMeBlockPixY + kiMinMv; int32_t iMaxPos = kiCurMeBlockPixY + kiMaxMv; int32_t iFixedMvd = * (pMvdTable - pMe->sMvp.iMvX); uint16_t* pMvdCost = & (pMvdTable[ (kiMinMv * (1 << 2)) - pMe->sMvp.iMvY]); int16_t iStartMv = 0; const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16; const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8; PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16]; PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize]; PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx; PTransposeMatrixBlocksFunc TransposeMatrixBlocks = kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx; const int32_t kiDiff = iMaxPos - iMinPos; const int32_t kiRowNum = WELS_ALIGN ((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks); const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum >> 4) : (kiRowNum >> 3); int32_t iCountLoop8 = (kiRowNum - kiEdgeBlocks) >> 3; const int32_t kiRemainingVectors = kiDiff - (iCountLoop8 << 3); const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE; ENFORCE_STACK_ALIGN_2D (uint8_t, uiMatrixRef, 16, kiMatrixStride, 16); // transpose matrix result for ref ENFORCE_STACK_ALIGN_2D (uint8_t, uiMatrixEnc, 16, 16, 16); // transpose matrix result for enc assert (kiRowNum <= kiMatrixStride); // make sure effective memory TransposeMatrixBlock (&uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride); TransposeMatrixBlocks (&uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum); ENFORCE_STACK_ALIGN_1D (uint16_t, uiBaseCost, 8, 16); int32_t iTargetPos = iMinPos; int16_t iBestPos = pMe->sMv.iMvX; uint32_t uiBestCost = pMe->uiSadCost; uint32_t uiCostMin; int32_t iIndexMinPos; kpEncMb = &uiMatrixEnc[0][0]; pRef = &uiMatrixRef[0][0]; while (iCountLoop8 > 0) { CalcMvdCostx8_c (uiBaseCost, iStartMv, pMvdCost, iFixedMvd); uiCostMin = pSampleSadHor8 (kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos); if (uiCostMin < uiBestCost) { uiBestCost = uiCostMin; iBestPos = iTargetPos + iIndexMinPos; } iTargetPos += 8; pRef += 8; iStartMv += 8; -- iCountLoop8; } if (kiRemainingVectors > 0) { kpEncMb = pMe->pEncMb; pRef = &pMe->pColoRefMb[ (iTargetPos - kiCurMeBlockPix) * kiRefStride]; while (iTargetPos < iMaxPos) { const uint16_t uiMvdCost = pMvdCost[iStartMv * (1 << 2)]; uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + uiMvdCost); if (uiSadCost < uiBestCost) { uiBestCost = uiSadCost; iBestPos = iTargetPos; } iStartMv++; pRef += kiRefStride; ++iTargetPos; } } if (uiBestCost < pMe->uiSadCost) { SMVUnitXY sBestMv; sBestMv.iMvX = 0; sBestMv.iMvY = iBestPos - kiCurMeBlockPix; UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY * kiRefStride], pMe); } } void HorizontalFullSearchUsingSSE41 (SWelsFuncPtrList* pFuncList, SWelsME* pMe, uint16_t* pMvdTable, const int32_t kiEncStride, const int32_t kiRefStride, const int16_t kiMinMv, const int16_t kiMaxMv, const bool bVerticalSearch) { uint8_t* kpEncMb = pMe->pEncMb; const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX; int32_t iMinPos = iCurMeBlockPixX + kiMinMv; int32_t iMaxPos = iCurMeBlockPixX + kiMaxMv; int32_t iFixedMvd = * (pMvdTable - pMe->sMvp.iMvY); uint16_t* pMvdCost = & (pMvdTable[ (kiMinMv * (1 << 2)) - pMe->sMvp.iMvX]); int16_t iStartMv = 0; uint8_t* pRef = &pMe->pColoRefMb[kiMinMv]; const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16; PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16]; PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize]; ENFORCE_STACK_ALIGN_1D (uint16_t, uiBaseCost, 8, 16); const int32_t kiNumVector = iMaxPos - iMinPos; int32_t iCountLoop8 = kiNumVector >> 3; const int32_t kiRemainingLoop8 = kiNumVector & 7; int32_t iTargetPos = iMinPos; int16_t iBestPos = pMe->sMv.iMvX; uint32_t uiBestCost = pMe->uiSadCost; uint32_t uiCostMin; int32_t iIndexMinPos; while (iCountLoop8 > 0) { CalcMvdCostx8_c (uiBaseCost, iStartMv, pMvdCost, iFixedMvd); uiCostMin = pSampleSadHor8 (kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos); if (uiCostMin < uiBestCost) { uiBestCost = uiCostMin; iBestPos = iTargetPos + iIndexMinPos; } iTargetPos += 8; pRef += 8; iStartMv += 8; -- iCountLoop8; } if (kiRemainingLoop8 > 0) { while (iTargetPos < iMaxPos) { const uint16_t uiMvdCost = pMvdCost[iStartMv * (1 << 2)]; uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + uiMvdCost); if (uiSadCost < uiBestCost) { uiBestCost = uiSadCost; iBestPos = iTargetPos; } iStartMv++; ++pRef; ++iTargetPos; } } if (uiBestCost < pMe->uiSadCost) { SMVUnitXY sBestMv; sBestMv.iMvX = iBestPos - iCurMeBlockPixX; sBestMv.iMvY = 0; UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvX], pMe); } } #endif void LineFullSearch_c (SWelsFuncPtrList* pFuncList, SWelsME* pMe, uint16_t* pMvdTable, const int32_t kiEncStride, const int32_t kiRefStride, const int16_t iMinMv, const int16_t iMaxMv, const bool bVerticalSearch) { PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize]; const int32_t kiCurMeBlockPixX = pMe->iCurMeBlockPixX; const int32_t kiCurMeBlockPixY = pMe->iCurMeBlockPixY; int32_t iMinPos, iMaxPos; int32_t iFixedMvd; int32_t iCurMeBlockPix; int32_t iStride; uint16_t* pMvdCost; if (bVerticalSearch) { iMinPos = kiCurMeBlockPixY + iMinMv; iMaxPos = kiCurMeBlockPixY + iMaxMv; iFixedMvd = * (pMvdTable - pMe->sMvp.iMvX); iCurMeBlockPix = pMe->iCurMeBlockPixY; iStride = kiRefStride; pMvdCost = & (pMvdTable[ (iMinMv * (1 << 2)) - pMe->sMvp.iMvY]); } else { iMinPos = kiCurMeBlockPixX + iMinMv; iMaxPos = kiCurMeBlockPixX + iMaxMv; iFixedMvd = * (pMvdTable - pMe->sMvp.iMvY); iCurMeBlockPix = pMe->iCurMeBlockPixX; iStride = 1; pMvdCost = & (pMvdTable[ (iMinMv * (1 << 2)) - pMe->sMvp.iMvX]); } uint8_t* pRef = &pMe->pColoRefMb[ iMinMv * iStride]; uint32_t uiBestCost = 0xFFFFFFFF; int32_t iBestPos = 0; for (int32_t iTargetPos = iMinPos; iTargetPos < iMaxPos; ++ iTargetPos) { uint8_t* const kpEncMb = pMe->pEncMb; uint32_t uiSadCost = pSad (kpEncMb, kiEncStride, pRef, kiRefStride) + (iFixedMvd + *pMvdCost); if (uiSadCost < uiBestCost) { uiBestCost = uiSadCost; iBestPos = iTargetPos; } pRef += iStride; pMvdCost += 4; } if (uiBestCost < pMe->uiSadCost) { SMVUnitXY sBestMv; sBestMv.iMvX = bVerticalSearch ? 0 : (iBestPos - iCurMeBlockPix); sBestMv.iMvY = bVerticalSearch ? (iBestPos - iCurMeBlockPix) : 0; UpdateMeResults (sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY * kiRefStride + sBestMv.iMvX], pMe); } } void WelsMotionCrossSearch (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) { PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch; PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch; //vertical search pfVerticalFullSearchFunc (pFuncList, pMe, pMe->pMvdCost, kiEncStride, kiRefStride, pSlice->sMvStartMin.iMvY, pSlice->sMvStartMax.iMvY, true); //horizontal search if (pMe->uiSadCost >= pMe->uiSadCostThreshold) { pfHorizontalFullSearchFunc (pFuncList, pMe, pMe->pMvdCost, kiEncStride, kiRefStride, pSlice->sMvStartMin.iMvX, pSlice->sMvStartMax.iMvX, false); } } ///////////////////////// // Feature Search Basics ///////////////////////// //memory related int32_t RequestFeatureSearchPreparation (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage, SFeatureSearchPreparation* pFeatureSearchPreparation) { const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage >> 16; const bool bFme8x8 = ((iNeedFeatureStorage & 0x0000FF & ME_FME) == ME_FME); const int32_t kiMarginSize = bFme8x8 ? 8 : 16; const int32_t kiFrameSize = (kiFrameWidth - kiMarginSize) * (kiFrameHeight - kiMarginSize); int32_t iListOfFeatureOfBlock; if (0 == kiFeatureStrategyIndex) { iListOfFeatureOfBlock = sizeof (uint16_t) * kiFrameSize; } else { iListOfFeatureOfBlock = sizeof (uint16_t) * kiFrameSize + (kiFrameWidth - kiMarginSize) * sizeof (uint32_t) + kiFrameWidth * 8 * sizeof (uint8_t); } pFeatureSearchPreparation->pFeatureOfBlock = (uint16_t*)pMa->WelsMalloc (iListOfFeatureOfBlock, "pFeatureOfBlock"); WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == (pFeatureSearchPreparation->pFeatureOfBlock)) pFeatureSearchPreparation->uiFeatureStrategyIndex = kiFeatureStrategyIndex; pFeatureSearchPreparation->bFMESwitchFlag = true; pFeatureSearchPreparation->uiFMEGoodFrameCount = FMESWITCH_DEFAULT_GOODFRAME_NUM; pFeatureSearchPreparation->iHighFreMbCount = 0; return ENC_RETURN_SUCCESS; } int32_t ReleaseFeatureSearchPreparation (CMemoryAlign* pMa, uint16_t*& pFeatureOfBlock) { if (pMa && pFeatureOfBlock) { pMa->WelsFree (pFeatureOfBlock, "pFeatureOfBlock"); pFeatureOfBlock = NULL; return ENC_RETURN_SUCCESS; } return ENC_RETURN_UNEXPECTED; } int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) { const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage >> 16; const int32_t kiMe8x8FME = iNeedFeatureStorage & 0x0000FF & ME_FME; const int32_t kiMe16x16FME = ((iNeedFeatureStorage & 0x00FF00) >> 8) & ME_FME; if ((kiMe8x8FME == ME_FME) && (kiMe16x16FME == ME_FME)) { return ENC_RETURN_UNSUPPORTED_PARA; //the following memory allocation cannot support when FME at both size } const bool bIsBlock8x8 = (kiMe8x8FME == ME_FME); const int32_t kiMarginSize = bIsBlock8x8 ? 8 : 16; const int32_t kiFrameSize = (kiFrameWidth - kiMarginSize) * (kiFrameHeight - kiMarginSize); const int32_t kiListSize = (0 == kiFeatureStrategyIndex) ? (bIsBlock8x8 ? LIST_SIZE_SUM_8x8 : LIST_SIZE_SUM_16x16) : 256; pScreenBlockFeatureStorage->pTimesOfFeatureValue = (uint32_t*)pMa->WelsMalloc (kiListSize * sizeof (uint32_t), "pScreenBlockFeatureStorage->pTimesOfFeatureValue"); WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pTimesOfFeatureValue) pScreenBlockFeatureStorage->pLocationOfFeature = (uint16_t**)pMa->WelsMalloc (kiListSize * sizeof (uint16_t*), "pScreenBlockFeatureStorage->pLocationOfFeature"); WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationOfFeature) pScreenBlockFeatureStorage->pLocationPointer = (uint16_t*)pMa->WelsMalloc (2 * kiFrameSize * sizeof (uint16_t), "pScreenBlockFeatureStorage->pLocationPointer"); WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationPointer) // uint16_t* pFeatureValuePointerList[WELS_MAX (LIST_SIZE_SUM_16x16, LIST_SIZE_MSE_16x16)] = {0}; pScreenBlockFeatureStorage->pFeatureValuePointerList = (uint16_t**)pMa->WelsMalloc (WELS_MAX (LIST_SIZE_SUM_16x16, LIST_SIZE_MSE_16x16) * sizeof (uint16_t*), "pScreenBlockFeatureStorage->pFeatureValuePointerList"); WELS_VERIFY_RETURN_IF (ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pFeatureValuePointerList) pScreenBlockFeatureStorage->pFeatureOfBlockPointer = NULL; pScreenBlockFeatureStorage->iIs16x16 = !bIsBlock8x8; pScreenBlockFeatureStorage->uiFeatureStrategyIndex = kiFeatureStrategyIndex; pScreenBlockFeatureStorage->iActualListSize = kiListSize; WelsSetMemMultiplebytes_c (pScreenBlockFeatureStorage->uiSadCostThreshold, UINT_MAX, BLOCK_SIZE_ALL, sizeof(uint32_t)); pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = false; return ENC_RETURN_SUCCESS; } int32_t ReleaseScreenBlockFeatureStorage (CMemoryAlign* pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) { if (pMa && pScreenBlockFeatureStorage) { if (pScreenBlockFeatureStorage->pTimesOfFeatureValue) { pMa->WelsFree (pScreenBlockFeatureStorage->pTimesOfFeatureValue, "pScreenBlockFeatureStorage->pTimesOfFeatureValue"); pScreenBlockFeatureStorage->pTimesOfFeatureValue = NULL; } if (pScreenBlockFeatureStorage->pLocationOfFeature) { pMa->WelsFree (pScreenBlockFeatureStorage->pLocationOfFeature, "pScreenBlockFeatureStorage->pLocationOfFeature"); pScreenBlockFeatureStorage->pLocationOfFeature = NULL; } if (pScreenBlockFeatureStorage->pLocationPointer) { pMa->WelsFree (pScreenBlockFeatureStorage->pLocationPointer, "pScreenBlockFeatureStorage->pLocationPointer"); pScreenBlockFeatureStorage->pLocationPointer = NULL; } if (pScreenBlockFeatureStorage->pFeatureValuePointerList) { pMa->WelsFree (pScreenBlockFeatureStorage->pFeatureValuePointerList, "pScreenBlockFeatureStorage->pFeatureValuePointerList"); pScreenBlockFeatureStorage->pFeatureValuePointerList = NULL; } return ENC_RETURN_SUCCESS; } return ENC_RETURN_UNEXPECTED; } //preprocess related int32_t SumOf8x8SingleBlock_c (uint8_t* pRef, const int32_t kiRefStride) { int32_t iSum = 0, i; for (i = 0; i < 8; i++) { iSum += pRef[0] + pRef[1] + pRef[2] + pRef[3]; iSum += pRef[4] + pRef[5] + pRef[6] + pRef[7]; pRef += kiRefStride; } return iSum; } int32_t SumOf16x16SingleBlock_c (uint8_t* pRef, const int32_t kiRefStride) { int32_t iSum = 0, i; for (i = 0; i < 16; i++) { iSum += pRef[0] + pRef[1] + pRef[2] + pRef[3]; iSum += pRef[4] + pRef[5] + pRef[6] + pRef[7]; iSum += pRef[8] + pRef[9] + pRef[10] + pRef[11]; iSum += pRef[12] + pRef[13] + pRef[14] + pRef[15]; pRef += kiRefStride; } return iSum; } void SumOf8x8BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) { int32_t x, y; uint8_t* pRef; uint16_t* pBuffer; int32_t iSum; for (y = 0; y < kiHeight; y++) { pRef = pRefPicture + kiRefStride * y; pBuffer = pFeatureOfBlock + kiWidth * y; for (x = 0; x < kiWidth; x++) { iSum = SumOf8x8SingleBlock_c (pRef + x, kiRefStride); pBuffer[x] = iSum; pTimesOfFeatureValue[iSum]++; } } } void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) { //TODO: this is similar to SumOf8x8BlockOfFrame_c expect the calling of single block func, refactor-able? int32_t x, y; uint8_t* pRef; uint16_t* pBuffer; int32_t iSum; for (y = 0; y < kiHeight; y++) { pRef = pRefPicture + kiRefStride * y; pBuffer = pFeatureOfBlock + kiWidth * y; for (x = 0; x < kiWidth; x++) { iSum = SumOf16x16SingleBlock_c (pRef + x, kiRefStride); pBuffer[x] = iSum; pTimesOfFeatureValue[iSum]++; } } } void InitializeHashforFeature_c (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList) { //assign location pointer uint16_t* pBufPos = pBuf; for (int32_t i = 0 ; i < kiListSize; ++i) { pLocationOfFeature[i] = pFeatureValuePointerList[i] = pBufPos; pBufPos += (pTimesOfFeatureValue[i] << 1); } } void FillQpelLocationByFeatureValue_c (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList) { //assign each pixel's position uint16_t* pSrcPointer = pFeatureOfBlock; int32_t iQpelY = 0; for (int32_t y = 0; y < kiHeight; y++) { for (int32_t x = 0; x < kiWidth; x++) { uint16_t uiFeature = pSrcPointer[x]; pFeatureValuePointerList[uiFeature][0] = x << 2; pFeatureValuePointerList[uiFeature][1] = iQpelY; pFeatureValuePointerList[uiFeature] += 2; } iQpelY += 4; pSrcPointer += kiWidth; } } bool CalculateFeatureOfBlock (SWelsFuncPtrList* pFunc, SPicture* pRef, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) { uint16_t* pFeatureOfBlock = pScreenBlockFeatureStorage->pFeatureOfBlockPointer; uint32_t* pTimesOfFeatureValue = pScreenBlockFeatureStorage->pTimesOfFeatureValue; uint16_t** pLocationOfFeature = pScreenBlockFeatureStorage->pLocationOfFeature; uint16_t* pBuf = pScreenBlockFeatureStorage->pLocationPointer; if (NULL == pFeatureOfBlock || NULL == pTimesOfFeatureValue || NULL == pLocationOfFeature || NULL == pBuf || NULL == pRef->pData[0]) { return false; } uint8_t* pRefData = pRef->pData[0]; const int32_t iRefStride = pRef->iLineSize[0]; int32_t iIs16x16 = pScreenBlockFeatureStorage->iIs16x16; const int32_t iEdgeDiscard = (iIs16x16 ? 16 : 8); //this is to save complexity of padding on pRef const int32_t iWidth = pRef->iWidthInPixel - iEdgeDiscard; const int32_t kiHeight = pRef->iHeightInPixel - iEdgeDiscard; const int32_t kiActualListSize = pScreenBlockFeatureStorage->iActualListSize; memset (pTimesOfFeatureValue, 0, sizeof (int32_t)*kiActualListSize); (pFunc->pfCalculateBlockFeatureOfFrame[iIs16x16]) (pRefData, iWidth, kiHeight, iRefStride, pFeatureOfBlock, pTimesOfFeatureValue); //assign pLocationOfFeature pointer pFunc->pfInitializeHashforFeature (pTimesOfFeatureValue, pBuf, kiActualListSize, pLocationOfFeature, pScreenBlockFeatureStorage->pFeatureValuePointerList); //assign each pixel's pLocationOfFeature pFunc->pfFillQpelLocationByFeatureValue (pFeatureOfBlock, iWidth, kiHeight, pScreenBlockFeatureStorage->pFeatureValuePointerList); return true; } void PerformFMEPreprocess (SWelsFuncPtrList* pFunc, SPicture* pRef, uint16_t* pFeatureOfBlock, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) { pScreenBlockFeatureStorage->pFeatureOfBlockPointer = pFeatureOfBlock; pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = CalculateFeatureOfBlock (pFunc, pRef, pScreenBlockFeatureStorage); if (pScreenBlockFeatureStorage->bRefBlockFeatureCalculated) { uint32_t uiRefPictureAvgQstepx16 = QStepx16ByQp[WelsMedian (0, pRef->iFrameAverageQp, 51)]; uint32_t uiSadCostThreshold16x16 = ((30 * (uiRefPictureAvgQstepx16 + 160)) >> 3); pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x16] = uiSadCostThreshold16x16; pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x8] = (uiSadCostThreshold16x16 >> 2); pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x8] = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x16] = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_4x4] = UINT_MAX; } } //search related bool SetFeatureSearchIn (SWelsFuncPtrList* pFunc, const SWelsME& sMe, const SSlice* pSlice, SScreenBlockFeatureStorage* pRefFeatureStorage, const int32_t kiEncStride, const int32_t kiRefStride, SFeatureSearchIn* pFeatureSearchIn) { pFeatureSearchIn->pSad = pFunc->sSampleDealingFuncs.pfSampleSad[sMe.uiBlockSize]; pFeatureSearchIn->iFeatureOfCurrent = pFunc->pfCalculateSingleBlockFeature[BLOCK_16x16 == sMe.uiBlockSize] (sMe.pEncMb, kiEncStride); pFeatureSearchIn->pEnc = sMe.pEncMb; pFeatureSearchIn->pColoRef = sMe.pColoRefMb; pFeatureSearchIn->iEncStride = kiEncStride; pFeatureSearchIn->iRefStride = kiRefStride; pFeatureSearchIn->uiSadCostThresh = sMe.uiSadCostThreshold; pFeatureSearchIn->iCurPixX = sMe.iCurMeBlockPixX; pFeatureSearchIn->iCurPixXQpel = (pFeatureSearchIn->iCurPixX << 2); pFeatureSearchIn->iCurPixY = sMe.iCurMeBlockPixY; pFeatureSearchIn->iCurPixYQpel = (pFeatureSearchIn->iCurPixY << 2); pFeatureSearchIn->pTimesOfFeature = pRefFeatureStorage->pTimesOfFeatureValue; pFeatureSearchIn->pQpelLocationOfFeature = pRefFeatureStorage->pLocationOfFeature; pFeatureSearchIn->pMvdCostX = sMe.pMvdCost - pFeatureSearchIn->iCurPixXQpel - sMe.sMvp.iMvX; pFeatureSearchIn->pMvdCostY = sMe.pMvdCost - pFeatureSearchIn->iCurPixYQpel - sMe.sMvp.iMvY; pFeatureSearchIn->iMinQpelX = pFeatureSearchIn->iCurPixXQpel + ((pSlice->sMvStartMin.iMvX) * (1 << 2)); pFeatureSearchIn->iMinQpelY = pFeatureSearchIn->iCurPixYQpel + ((pSlice->sMvStartMin.iMvY) * (1 << 2)); pFeatureSearchIn->iMaxQpelX = pFeatureSearchIn->iCurPixXQpel + ((pSlice->sMvStartMax.iMvX) * (1 << 2)); pFeatureSearchIn->iMaxQpelY = pFeatureSearchIn->iCurPixYQpel + ((pSlice->sMvStartMax.iMvY) * (1 << 2)); if (NULL == pFeatureSearchIn->pSad || NULL == pFeatureSearchIn->pTimesOfFeature || NULL == pFeatureSearchIn->pQpelLocationOfFeature) { return false; } return true; } void SaveFeatureSearchOut (const SMVUnitXY sBestMv, const uint32_t uiBestSadCost, uint8_t* pRef, SFeatureSearchOut* pFeatureSearchOut) { pFeatureSearchOut->sBestMv = sBestMv; pFeatureSearchOut->uiBestSadCost = uiBestSadCost; pFeatureSearchOut->pBestRef = pRef; } bool FeatureSearchOne (SFeatureSearchIn& sFeatureSearchIn, const int32_t iFeatureDifference, const uint32_t kuiExpectedSearchTimes, SFeatureSearchOut* pFeatureSearchOut) { const int32_t iFeatureOfRef = (sFeatureSearchIn.iFeatureOfCurrent + iFeatureDifference); if (iFeatureOfRef < 0 || iFeatureOfRef >= LIST_SIZE) return true; PSampleSadSatdCostFunc pSad = sFeatureSearchIn.pSad; uint8_t* pEnc = sFeatureSearchIn.pEnc; uint8_t* pColoRef = sFeatureSearchIn.pColoRef; const int32_t iEncStride = sFeatureSearchIn.iEncStride; const int32_t iRefStride = sFeatureSearchIn.iRefStride; const uint16_t uiSadCostThresh = sFeatureSearchIn.uiSadCostThresh; const int32_t iCurPixX = sFeatureSearchIn.iCurPixX; const int32_t iCurPixY = sFeatureSearchIn.iCurPixY; const int32_t iCurPixXQpel = sFeatureSearchIn.iCurPixXQpel; const int32_t iCurPixYQpel = sFeatureSearchIn.iCurPixYQpel; const int32_t iMinQpelX = sFeatureSearchIn.iMinQpelX; const int32_t iMinQpelY = sFeatureSearchIn.iMinQpelY; const int32_t iMaxQpelX = sFeatureSearchIn.iMaxQpelX; const int32_t iMaxQpelY = sFeatureSearchIn.iMaxQpelY; const int32_t iSearchTimes = WELS_MIN (sFeatureSearchIn.pTimesOfFeature[iFeatureOfRef], kuiExpectedSearchTimes); const int32_t iSearchTimesx2 = (iSearchTimes << 1); const uint16_t* pQpelPosition = sFeatureSearchIn.pQpelLocationOfFeature[iFeatureOfRef]; SMVUnitXY sBestMv; uint32_t uiBestCost, uiTmpCost; uint8_t* pBestRef, *pCurRef; int32_t iQpelX, iQpelY; int32_t iIntepelX, iIntepelY; int32_t i; sBestMv.iMvX = pFeatureSearchOut->sBestMv.iMvX; sBestMv.iMvY = pFeatureSearchOut->sBestMv.iMvY; uiBestCost = pFeatureSearchOut->uiBestSadCost; pBestRef = pFeatureSearchOut->pBestRef; for (i = 0; i < iSearchTimesx2; i += 2) { iQpelX = pQpelPosition[i]; iQpelY = pQpelPosition[i + 1]; if ((iQpelX > iMaxQpelX) || (iQpelX < iMinQpelX) || (iQpelY > iMaxQpelY) || (iQpelY < iMinQpelY) || (iQpelX == iCurPixXQpel) || (iQpelY == iCurPixYQpel)) continue; uiTmpCost = sFeatureSearchIn.pMvdCostX[ iQpelX ] + sFeatureSearchIn.pMvdCostY[ iQpelY ]; if (uiTmpCost + iFeatureDifference >= uiBestCost) continue; iIntepelX = (iQpelX >> 2) - iCurPixX; iIntepelY = (iQpelY >> 2) - iCurPixY; pCurRef = &pColoRef[iIntepelX + iIntepelY * iRefStride]; uiTmpCost += pSad (pEnc, iEncStride, pCurRef, iRefStride); if (uiTmpCost < uiBestCost) { sBestMv.iMvX = iIntepelX; sBestMv.iMvY = iIntepelY; uiBestCost = uiTmpCost; pBestRef = pCurRef; if (uiBestCost < uiSadCostThresh) break; } } SaveFeatureSearchOut (sBestMv, uiBestCost, pBestRef, pFeatureSearchOut); return (i < iSearchTimesx2); } void MotionEstimateFeatureFullSearch (SFeatureSearchIn& sFeatureSearchIn, const uint32_t kuiMaxSearchPoint, SWelsME* pMe) { SFeatureSearchOut sFeatureSearchOut = { { 0 } };//TODO: this can be refactored and removed sFeatureSearchOut.uiBestSadCost = pMe->uiSadCost; sFeatureSearchOut.sBestMv = pMe->sMv; sFeatureSearchOut.pBestRef = pMe->pRefMb; int32_t iFeatureDifference = 0;//TODO: change it according to computational-complexity setting when needed FeatureSearchOne (sFeatureSearchIn, iFeatureDifference, kuiMaxSearchPoint, &sFeatureSearchOut); if (sFeatureSearchOut.uiBestSadCost < pMe->uiSadCost) { //TODO: this may be refactored and removed UpdateMeResults (sFeatureSearchOut.sBestMv, sFeatureSearchOut.uiBestSadCost, sFeatureSearchOut.pBestRef, pMe); } } //switch related static uint32_t CountFMECostDown (const SDqLayer* pCurLayer) { uint32_t uiCostDownSum = 0; const int32_t kiSliceCount = GetCurrentSliceNum (pCurLayer); if (kiSliceCount >= 1) { int32_t iSliceIndex = 0; SSlice* pSlice = &pCurLayer->sLayerInfo.pSliceInLayer[iSliceIndex]; while (iSliceIndex < kiSliceCount) { uiCostDownSum += pSlice->uiSliceFMECostDown; ++ pSlice; ++ iSliceIndex; } } return uiCostDownSum; } #define FMESWITCH_MBAVERCOSTSAVING_THRESHOLD (2) //empirically set. #define FMESWITCH_GOODFRAMECOUNT_MAX (5) //empirically set. static void UpdateFMEGoodFrameCount (const uint32_t iAvMBNormalizedRDcostDown, uint8_t& uiFMEGoodFrameCount) { //this strategy may be changed, here the number is derived from empirical-numbers // uiFMEGoodFrameCount lies in [0,FMESWITCH_GOODFRAMECOUNT_MAX] if (iAvMBNormalizedRDcostDown > FMESWITCH_MBAVERCOSTSAVING_THRESHOLD) { if (uiFMEGoodFrameCount < FMESWITCH_GOODFRAMECOUNT_MAX) ++ uiFMEGoodFrameCount; } else { if (uiFMEGoodFrameCount > 0) -- uiFMEGoodFrameCount; } } void UpdateFMESwitch (SDqLayer* pCurLayer) { const uint32_t iFMECost = CountFMECostDown (pCurLayer); const uint32_t iAvMBNormalizedRDcostDown = iFMECost / (pCurLayer->iMbWidth * pCurLayer->iMbHeight); UpdateFMEGoodFrameCount (iAvMBNormalizedRDcostDown, pCurLayer->pFeatureSearchPreparation->uiFMEGoodFrameCount); } void UpdateFMESwitchNull (SDqLayer* pCurLayer) { } ///////////////////////// // Search function options ///////////////////////// void WelsDiamondCrossSearch (SWelsFuncPtrList* pFunc, SWelsME* pMe, SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) { // Step 1: diamond search WelsDiamondSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride); // Step 2: CROSS search pMe->uiSadCostThreshold = pMe->pRefFeatureStorage->uiSadCostThreshold[pMe->uiBlockSize]; if (pMe->uiSadCost >= pMe->uiSadCostThreshold) { WelsMotionCrossSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride); } } void WelsDiamondCrossFeatureSearch (SWelsFuncPtrList* pFunc, SWelsME* pMe, SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) { // Step 1: diamond search + cross WelsDiamondCrossSearch (pFunc, pMe, pSlice, kiEncStride, kiRefStride); // Step 2: FeatureSearch if (pMe->uiSadCost >= pMe->uiSadCostThreshold) { pSlice->uiSliceFMECostDown += pMe->uiSadCost; uint32_t uiMaxSearchPoint = INT_MAX;//TODO: change it according to computational-complexity setting SFeatureSearchIn sFeatureSearchIn = {0}; if (SetFeatureSearchIn (pFunc, *pMe, pSlice, pMe->pRefFeatureStorage, kiEncStride, kiRefStride, &sFeatureSearchIn)) { MotionEstimateFeatureFullSearch (sFeatureSearchIn, uiMaxSearchPoint, pMe); } pSlice->uiSliceFMECostDown -= pMe->uiSadCost; } } } // namespace WelsEnc