ref: a128d7f790d57107f3fbb95edc26c37356367a81
dir: /codec/encoder/core/src/encode_mb_aux.cpp/
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "ls_defines.h"
#include "encode_mb_aux.h"
#include "cpu_core.h"

namespace WelsSVCEnc {

/*!
 * Rounding offsets ("FF") added to the coefficient magnitude before the
 * multiply/shift in the quantizers below.  Each row holds an 8-entry pattern
 * (two rows of four coefficients) that the quantizers index with `i & 0x07`.
 * Rows are labelled 0..51 (presumably the quantization parameter - confirm
 * against callers); the final six rows are marked as intra-only.
 */
__align16 (const int16_t, g_kiQuantInterFF[58][8]) = {
  /* 0*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
  /* 1*/ {   0,   1,   0,   1,   1,   1,   1,   1 },
  /* 2*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
  /* 3*/ {   1,   1,   1,   1,   1,   1,   1,   1 },
  /* 4*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
  /* 5*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
  /* 6*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
  /* 7*/ {   1,   1,   1,   1,   1,   2,   1,   2 },
  /* 8*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
  /* 9*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
  /*10*/ {   1,   2,   1,   2,   2,   3,   2,   3 },
  /*11*/ {   1,   2,   1,   2,   2,   4,   2,   4 },
  /*12*/ {   2,   3,   2,   3,   3,   4,   3,   4 },
  /*13*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
  /*14*/ {   2,   3,   2,   3,   3,   5,   3,   5 },
  /*15*/ {   2,   4,   2,   4,   4,   6,   4,   6 },
  /*16*/ {   3,   4,   3,   4,   4,   7,   4,   7 },
  /*17*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
  /*18*/ {   3,   5,   3,   5,   5,   8,   5,   8 },
  /*19*/ {   4,   6,   4,   6,   6,   9,   6,   9 },
  /*20*/ {   4,   7,   4,   7,   7,  10,   7,  10 },
  /*21*/ {   5,   8,   5,   8,   8,  12,   8,  12 },
  /*22*/ {   5,   8,   5,   8,   8,  13,   8,  13 },
  /*23*/ {   6,  10,   6,  10,  10,  15,  10,  15 },
  /*24*/ {   7,  11,   7,  11,  11,  17,  11,  17 },
  /*25*/ {   7,  12,   7,  12,  12,  19,  12,  19 },
  /*26*/ {   9,  13,   9,  13,  13,  21,  13,  21 },
  /*27*/ {   9,  15,   9,  15,  15,  24,  15,  24 },
  /*28*/ {  11,  17,  11,  17,  17,  26,  17,  26 },
  /*29*/ {  12,  19,  12,  19,  19,  30,  19,  30 },
  /*30*/ {  13,  22,  13,  22,  22,  33,  22,  33 },
  /*31*/ {  15,  23,  15,  23,  23,  38,  23,  38 },
  /*32*/ {  17,  27,  17,  27,  27,  42,  27,  42 },
  /*33*/ {  19,  30,  19,  30,  30,  48,  30,  48 },
  /*34*/ {  21,  33,  21,  33,  33,  52,  33,  52 },
  /*35*/ {  24,  38,  24,  38,  38,  60,  38,  60 },
  /*36*/ {  27,  43,  27,  43,  43,  67,  43,  67 },
  /*37*/ {  29,  47,  29,  47,  47,  75,  47,  75 },
  /*38*/ {  35,  53,  35,  53,  53,  83,  53,  83 },
  /*39*/ {  37,  60,  37,  60,  60,  96,  60,  96 },
  /*40*/ {  43,  67,  43,  67,  67, 104,  67, 104 },
  /*41*/ {  48,  77,  48,  77,  77, 121,  77, 121 },
  /*42*/ {  53,  87,  53,  87,  87, 133,  87, 133 },
  /*43*/ {  59,  93,  59,  93,  93, 150,  93, 150 },
  /*44*/ {  69, 107,  69, 107, 107, 167, 107, 167 },
  /*45*/ {  75, 120,  75, 120, 120, 192, 120, 192 },
  /*46*/ {  85, 133,  85, 133, 133, 208, 133, 208 },
  /*47*/ {  96, 153,  96, 153, 153, 242, 153, 242 },
  /*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
  /*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
  /*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
  /*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
  /* from here below is only for intra */
  /*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
  /*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
  /*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
  /*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
  /*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
  /*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },
};

/*!
 * Multiplication factors ("MF") used by the quantizers below; the product is
 * shifted right by 16 afterwards.  Same 8-entry row layout as
 * g_kiQuantInterFF; rows are labelled 0..51.
 */
__align16 (const int16_t, g_kiQuantMF[52][8]) = {
  /* 0*/ {26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 },
  /* 1*/ {23832, 14980, 23832, 14980, 14980,  9320, 14980,  9320 },
  /* 2*/ {20164, 13108, 20164, 13108, 13108,  8388, 13108,  8388 },
  /* 3*/ {18724, 11650, 18724, 11650, 11650,  7294, 11650,  7294 },
  /* 4*/ {16384, 10486, 16384, 10486, 10486,  6710, 10486,  6710 },
  /* 5*/ {14564,  9118, 14564,  9118,  9118,  5786,  9118,  5786 },
  /* 6*/ {13107,  8066, 13107,  8066,  8066,  5243,  8066,  5243 },
  /* 7*/ {11916,  7490, 11916,  7490,  7490,  4660,  7490,  4660 },
  /* 8*/ {10082,  6554, 10082,  6554,  6554,  4194,  6554,  4194 },
  /* 9*/ { 9362,  5825,  9362,  5825,  5825,  3647,  5825,  3647 },
  /*10*/ { 8192,  5243,  8192,  5243,  5243,  3355,  5243,  3355 },
  /*11*/ { 7282,  4559,  7282,  4559,  4559,  2893,  4559,  2893 },
  /*12*/ { 6554,  4033,  6554,  4033,  4033,  2622,  4033,  2622 },
  /*13*/ { 5958,  3745,  5958,  3745,  3745,  2330,  3745,  2330 },
  /*14*/ { 5041,  3277,  5041,  3277,  3277,  2097,  3277,  2097 },
  /*15*/ { 4681,  2913,  4681,  2913,  2913,  1824,  2913,  1824 },
  /*16*/ { 4096,  2622,  4096,  2622,  2622,  1678,  2622,  1678 },
  /*17*/ { 3641,  2280,  3641,  2280,  2280,  1447,  2280,  1447 },
  /*18*/ { 3277,  2017,  3277,  2017,  2017,  1311,  2017,  1311 },
  /*19*/ { 2979,  1873,  2979,  1873,  1873,  1165,  1873,  1165 },
  /*20*/ { 2521,  1639,  2521,  1639,  1639,  1049,  1639,  1049 },
  /*21*/ { 2341,  1456,  2341,  1456,  1456,   912,  1456,   912 },
  /*22*/ { 2048,  1311,  2048,  1311,  1311,   839,  1311,   839 },
  /*23*/ { 1821,  1140,  1821,  1140,  1140,   723,  1140,   723 },
  /*24*/ { 1638,  1008,  1638,  1008,  1008,   655,  1008,   655 },
  /*25*/ { 1490,   936,  1490,   936,   936,   583,   936,   583 },
  /*26*/ { 1260,   819,  1260,   819,   819,   524,   819,   524 },
  /*27*/ { 1170,   728,  1170,   728,   728,   456,   728,   456 },
  /*28*/ { 1024,   655,  1024,   655,   655,   419,   655,   419 },
  /*29*/ {  910,   570,   910,   570,   570,   362,   570,   362 },
  /*30*/ {  819,   504,   819,   504,   504,   328,   504,   328 },
  /*31*/ {  745,   468,   745,   468,   468,   291,   468,   291 },
  /*32*/ {  630,   410,   630,   410,   410,   262,   410,   262 },
  /*33*/ {  585,   364,   585,   364,   364,   228,   364,   228 },
  /*34*/ {  512,   328,   512,   328,   328,   210,   328,   210 },
  /*35*/ {  455,   285,   455,   285,   285,   181,   285,   181 },
  /*36*/ {  410,   252,   410,   252,   252,   164,   252,   164 },
  /*37*/ {  372,   234,   372,   234,   234,   146,   234,   146 },
  /*38*/ {  315,   205,   315,   205,   205,   131,   205,   131 },
  /*39*/ {  293,   182,   293,   182,   182,   114,   182,   114 },
  /*40*/ {  256,   164,   256,   164,   164,   105,   164,   105 },
  /*41*/ {  228,   142,   228,   142,   142,    90,   142,    90 },
  /*42*/ {  205,   126,   205,   126,   126,    82,   126,    82 },
  /*43*/ {  186,   117,   186,   117,   117,    73,   117,    73 },
  /*44*/ {  158,   102,   158,   102,   102,    66,   102,    66 },
  /*45*/ {  146,    91,   146,    91,    91,    57,    91,    57 },
  /*46*/ {  128,    82,   128,    82,    82,    52,    82,    52 },
  /*47*/ {  114,    71,   114,    71,    71,    45,    71,    45 },
  /*48*/ {  102,    63,   102,    63,    63,    41,    63,    41 },
  /*49*/ {   93,    59,    93,    59,    59,    36,    59,    36 },
  /*50*/ {   79,    51,    79,    51,    51,    33,    51,    33 },
  /*51*/ {   73,    46,    73,    46,    46,    28,    46,    28 }
};

/****************************************************************************
 * HDM and Quant functions
 ****************************************************************************/

/*
 * All three macros rely on a local variable named `iSign` being in scope and
 * holding WELS_SIGN() of the coefficient being processed.
 * NOTE(review): WELS_SIGN is defined outside this file; the XOR/subtract
 * trick below only works if it yields 0 for non-negative and -1 for negative
 * input - confirm in the header.
 *
 *  WELS_ABS_LC(a)      (iSign ^ a) - iSign: conditional negate, used both to
 *                      take the magnitude and to restore the sign afterwards.
 *  NEW_QUANT(...)      ((iFF + |coef|) * iMF) >> 16, i.e. the unsigned level.
 *                      The replacement list is fully parenthesized so the
 *                      macro is safe inside any surrounding expression.
 *  WELS_NEW_QUANT(...) NEW_QUANT with the original sign re-applied.
 */
#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)
#define NEW_QUANT(pDct, iFF, iMF) ((((iFF) + WELS_ABS_LC(pDct)) * (iMF)) >> 16)
#define WELS_NEW_QUANT(pDct,iFF,iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))

/*!
 * \brief   Quantize one block of 16 coefficients in place.
 * \param   pDct  16 coefficients, overwritten with the quantized levels
 * \param   pFF   8 rounding offsets, indexed with (i & 7)
 * \param   pMF   8 multiplication factors, indexed with (i & 7)
 */
void WelsQuant4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t i, j, iSign;
  for (i = 0; i < 16; i += 4) {
    j = i & 0x07;   // the FF/MF pattern repeats every 8 coefficients
    iSign = WELS_SIGN (pDct[i]);
    pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
    iSign = WELS_SIGN (pDct[i + 1]);
    pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
    iSign = WELS_SIGN (pDct[i + 2]);
    pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
    iSign = WELS_SIGN (pDct[i + 3]);
    pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
  }
}

/*!
 * \brief   Quantize 16 coefficients in place using one offset/factor pair
 *          for every coefficient.
 * \param   pDct  16 coefficients, overwritten with the quantized levels
 * \param   iFF   rounding offset applied to every coefficient
 * \param   iMF   multiplication factor applied to every coefficient
 */
void WelsQuant4x4Dc_c (int16_t* pDct, int16_t iFF, int16_t iMF) {
  int32_t i, iSign;
  for (i = 0; i < 16; i += 4) {
    iSign = WELS_SIGN (pDct[i]);
    pDct[i] = WELS_NEW_QUANT (pDct[i], iFF, iMF);
    iSign = WELS_SIGN (pDct[i + 1]);
    pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], iFF, iMF);
    iSign = WELS_SIGN (pDct[i + 2]);
    pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], iFF, iMF);
    iSign = WELS_SIGN (pDct[i + 3]);
    pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], iFF, iMF);
  }
}

/*!
 * \brief   Quantize 64 consecutive coefficients (four blocks of 16) in place.
 * \param   pDct  64 coefficients, overwritten with the quantized levels
 * \param   pFF   8 rounding offsets, indexed with (i & 7)
 * \param   pMF   8 multiplication factors, indexed with (i & 7)
 */
void WelsQuantFour4x4_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t i, j, iSign;

  for (i = 0; i < 64; i += 4) {
    j = i & 0x07;
    iSign = WELS_SIGN (pDct[i]);
    pDct[i] = WELS_NEW_QUANT (pDct[i], pFF[j], pMF[j]);
    iSign = WELS_SIGN (pDct[i + 1]);
    pDct[i + 1] = WELS_NEW_QUANT (pDct[i + 1], pFF[j + 1], pMF[j + 1]);
    iSign = WELS_SIGN (pDct[i + 2]);
    pDct[i + 2] = WELS_NEW_QUANT (pDct[i + 2], pFF[j + 2], pMF[j + 2]);
    iSign = WELS_SIGN (pDct[i + 3]);
    pDct[i + 3] = WELS_NEW_QUANT (pDct[i + 3], pFF[j + 3], pMF[j + 3]);
  }
}

/*!
 * \brief   Quantize four blocks of 16 coefficients in place and report the
 *          largest quantized magnitude of each block.
 * \param   pDct  64 coefficients, overwritten with the quantized levels
 * \param   pFF   8 rounding offsets, indexed with (i & 7)
 * \param   pMF   8 multiplication factors, indexed with (i & 7)
 * \param   pMax  receives 4 values: the maximum unsigned level per block
 */
void WelsQuantFour4x4Max_c (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
  int32_t i, j, k, iSign;
  int16_t iMaxAbs;
  for (k = 0; k < 4; k++) {
    iMaxAbs = 0;
    for (i = 0; i < 16; i++) {
      j = i & 0x07;
      iSign = WELS_SIGN (pDct[i]);
      // store the unsigned level first so the maximum is taken on magnitudes
      pDct[i] = NEW_QUANT (pDct[i], pFF[j], pMF[j]);
      if (iMaxAbs < pDct[i]) iMaxAbs = pDct[i];
      // then put the original sign back
      pDct[i] = WELS_ABS_LC (pDct[i]);
    }
    pDct += 16;
    pMax[k] = iMaxAbs;
  }
}

/*!
 * \brief   2x2 Hadamard on the values at pRs[0], pRs[16], pRs[32], pRs[48],
 *          followed by a threshold test instead of a real quantization.
 *
 *          The threshold 65535 / iMF - iFF is the largest magnitude for which
 *          ((iFF + |x|) * iMF) >> 16 is still zero, so a non-zero result means
 *          at least one transformed value would quantize to a non-zero level.
 *
 * \param   pRs  source buffer; only indices 0, 16, 32 and 48 are read
 * \param   iFF  rounding offset
 * \param   iMF  multiplication factor; must not be 0 (used as a divisor)
 * \return  non-zero if any transformed value exceeds the threshold, else 0
 */
int32_t WelsHadamardQuant2x2Skip_c (int16_t* pRs, int16_t iFF,  int16_t iMF) {
  int16_t pDct[4], s[4];
  int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;

  s[0] = pRs[0]  + pRs[32];
  s[1] = pRs[0]  - pRs[32];
  s[2] = pRs[16] + pRs[48];
  s[3] = pRs[16] - pRs[48];

  pDct[0] = s[0] + s[2];
  pDct[1] = s[0] - s[2];
  pDct[2] = s[1] + s[3];
  pDct[3] = s[1] - s[3];

  return ((WELS_ABS (pDct[0]) > iThreshold) || (WELS_ABS (pDct[1]) > iThreshold) || (WELS_ABS (pDct[2]) > iThreshold)
          || (WELS_ABS (pDct[3]) > iThreshold));
}

/*!
 * \brief   2x2 Hadamard on pRs[0], pRs[16], pRs[32], pRs[48] followed by
 *          quantization of the four results.
 *
 *          The four source positions in pRs are cleared to 0.
 *
 * \param   pRs     source buffer; indices 0, 16, 32, 48 are read then zeroed
 * \param   iFF     rounding offset
 * \param   iMF     multiplication factor
 * \param   pDct    receives the 4 quantized values
 * \param   pBlock  receives a copy of the 4 quantized values
 * \return  number of non-zero values among the 4 results (0..4)
 */
int32_t WelsHadamardQuant2x2_c (int16_t* pRs, const int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock) {
  int16_t s[4];
  int32_t iSign, i, iDcNzc = 0;

  s[0] = pRs[0]  + pRs[32];
  s[1] = pRs[0]  - pRs[32];
  s[2] = pRs[16] + pRs[48];
  s[3] = pRs[16] - pRs[48];

  pRs[0] = 0;
  pRs[16] = 0;
  pRs[32] = 0;
  pRs[48] = 0;

  pDct[0] = s[0] + s[2];
  pDct[1] = s[0] - s[2];
  pDct[2] = s[1] + s[3];
  pDct[3] = s[1] - s[3];

  iSign = WELS_SIGN (pDct[0]);
  pDct[0] = WELS_NEW_QUANT (pDct[0], iFF, iMF);
  iSign = WELS_SIGN (pDct[1]);
  pDct[1] = WELS_NEW_QUANT (pDct[1], iFF, iMF);
  iSign = WELS_SIGN (pDct[2]);
  pDct[2] = WELS_NEW_QUANT (pDct[2], iFF, iMF);
  iSign = WELS_SIGN (pDct[3]);
  pDct[3] = WELS_NEW_QUANT (pDct[3], iFF, iMF);

  // ST64/LD64 are project macros; presumably one 8-byte copy of pDct[0..3]
  ST64 (pBlock, LD64 (pDct));

  for (i = 0; i < 4; i++)
    iDcNzc += (pBlock[i] != 0);
  return iDcNzc;
}

/* dc value pick up and hdm_4x4 */
/*!
 * \brief   Gather the first coefficient of 16 consecutive 16-coefficient
 *          blocks and apply a 4x4 Hadamard transform to them.
 *
 *          Each output is (sum + 1) >> 1, clipped to [-32768, 32767].
 *
 * \param   pLumaDc  receives the 16 transformed values
 * \param   pDct     source; read at offsets 0, 16, 32, ... 240
 */
void WelsHadamardT4Dc_c (int16_t* pLumaDc, int16_t* pDct) {
  int32_t p[16], s[4];
  int32_t i, iIdx;

  for (i = 0 ; i < 16 ; i += 4) {
    // i = 0, 4, 8, 12  ->  iIdx = 0, 32, 128, 160
    iIdx = ((i & 0x08) << 4) + ((i & 0x04) << 3);
    s[0] = pDct[iIdx ]     + pDct[iIdx + 80];
    s[3] = pDct[iIdx ]     - pDct[iIdx + 80];
    s[1] = pDct[iIdx + 16] + pDct[iIdx + 64];
    s[2] = pDct[iIdx + 16] - pDct[iIdx + 64];

    p[i  ]   = s[0] + s[1];
    p[i + 2] = s[0] - s[1];
    p[i + 1] = s[3] + s[2];
    p[i + 3] = s[3] - s[2];
  }

  for (i = 0 ; i < 4 ; i ++) {
    s[0] = p[i ]    + p[i + 12];
    s[3] = p[i ]    - p[i + 12];
    s[1] = p[i + 4] + p[i + 8];
    s[2] = p[i + 4] - p[i + 8];

    pLumaDc[i  ]    = WELS_CLIP3 ((s[0] + s[1] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 8 ] = WELS_CLIP3 ((s[0] - s[1] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 4 ] = WELS_CLIP3 ((s[3] + s[2] + 1) >> 1, -32768, 32767);
    pLumaDc[i + 12] = WELS_CLIP3 ((s[3] - s[2] + 1) >> 1, -32768, 32767);
  }
}

/****************************************************************************
 * DCT functions
 ****************************************************************************/

/*!
 * \brief   4x4 forward integer transform of the residual pPixel1 - pPixel2.
 *
 *          Each row is differenced and transformed horizontally, then the
 *          columns are transformed vertically in place.  The doubling steps
 *          use a multiplication rather than `<< 1`: the operands are residual
 *          differences that are routinely negative, and left-shifting a
 *          negative signed value is undefined behaviour before C++20.
 *
 * \param   pDct      receives the 16 transform coefficients (row-major)
 * \param   pPixel1   first 4x4 pixel block
 * \param   iStride1  row stride of pPixel1 in bytes
 * \param   pPixel2   second 4x4 pixel block, subtracted from the first
 * \param   iStride2  row stride of pPixel2 in bytes
 */
void WelsDctT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  int16_t i, pData[16], s[4];
  for (i = 0 ; i < 16 ; i += 4) {
    const int32_t kiI1 = 1 + i;
    const int32_t kiI2 = 2 + i;
    const int32_t kiI3 = 3 + i;

    pData[i   ] = pPixel1[0] - pPixel2[0];
    pData[kiI1] = pPixel1[1] - pPixel2[1];
    pData[kiI2] = pPixel1[2] - pPixel2[2];
    pData[kiI3] = pPixel1[3] - pPixel2[3];

    pPixel1 += iStride1;
    pPixel2 += iStride2;

    /*horizontal transform */
    s[0] = pData[i]    + pData[kiI3];
    s[3] = pData[i]    - pData[kiI3];
    s[1] = pData[kiI1] + pData[kiI2];
    s[2] = pData[kiI1] - pData[kiI2];

    pDct[i   ] = s[0] + s[1];
    pDct[kiI2] = s[0] - s[1];
    pDct[kiI1] = (s[3] * 2) + s[2];
    pDct[kiI3] = s[3] - (s[2] * 2);
  }

  /* vertical transform */
  for (i = 0 ; i < 4 ; i ++) {
    const int32_t kiI4  = 4 + i;
    const int32_t kiI8  = 8 + i;
    const int32_t kiI12 = 12 + i;

    s[0] = pDct[i   ] + pDct[kiI12];
    s[3] = pDct[i   ] - pDct[kiI12];
    s[1] = pDct[kiI4] + pDct[kiI8 ];
    s[2] = pDct[kiI4] - pDct[kiI8 ];

    pDct[i    ] = s[0] + s[1];
    pDct[kiI8 ] = s[0] - s[1];
    pDct[kiI4 ] = (s[3] * 2) + s[2];
    pDct[kiI12] = s[3] - (s[2] * 2);
  }
}

/*!
 * \brief   Apply WelsDctT4_c to the four 4x4 quadrants of an 8x8 region.
 *
 *          Quadrant order in pDct: top-left, top-right, bottom-left,
 *          bottom-right, 16 coefficients each.
 *
 * \param   pDct      receives 64 coefficients
 * \param   pPixel1   first 8x8 pixel region
 * \param   iStride1  row stride of pPixel1 in bytes
 * \param   pPixel2   second 8x8 pixel region, subtracted from the first
 * \param   iStride2  row stride of pPixel2 in bytes
 */
void WelsDctFourT4_c (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) {
  int32_t stride_1 = iStride1 << 2;   // byte offset of the lower quadrants (4 rows down)
  int32_t stride_2 = iStride2 << 2;

  WelsDctT4_c (pDct,      &pPixel1[0],            iStride1, &pPixel2[0],            iStride2);
  WelsDctT4_c (pDct + 16, &pPixel1[4],            iStride1, &pPixel2[4],            iStride2);
  WelsDctT4_c (pDct + 32, &pPixel1[stride_1    ], iStride1, &pPixel2[stride_2    ], iStride2);
  WelsDctT4_c (pDct + 48, &pPixel1[stride_1 + 4], iStride1, &pPixel2[stride_2 + 4], iStride2);
}

/****************************************************************************
 * Scan and Score functions
 ****************************************************************************/

/*!
 * \brief   Reorder all 16 coefficients of a 4x4 block into zig-zag order:
 *          0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15.
 *
 *          ST32/LD32 are project macros; presumably a 4-byte copy that moves
 *          two adjacent int16_t values at once.
 *
 * \param   pLevel  receives the 16 reordered coefficients
 * \param   pDct    16 coefficients in row-major order
 */
void WelsScan4x4DcAc_c (int16_t* pLevel, int16_t* pDct) {
  ST32 (pLevel, LD32 (pDct));
  pLevel[2] = pDct[4];
  pLevel[3] = pDct[8];
  pLevel[4] = pDct[5];
  ST32 (pLevel + 5, LD32 (pDct + 2));
  pLevel[7] = pDct[6];
  pLevel[8] = pDct[9];
  ST32 (pLevel + 9, LD32 (pDct + 12));
  pLevel[11] = pDct[10];
  pLevel[12] = pDct[7];
  pLevel[13] = pDct[11];
  ST32 (pLevel + 14, LD32 (pDct + 14));
}

/*!
 * \brief   Same zig-zag reordering, skipping coefficient 0: pLevel[0..14]
 *          receive coefficients 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11,
 *          14, 15 and pLevel[15] is set to 0.
 * \param   pLevel  receives 16 values (15 coefficients plus a trailing 0)
 * \param   pDct    16 coefficients in row-major order
 */
void WelsScan4x4Ac_c (int16_t* pLevel, int16_t* pDct) {
  pLevel[0] = pDct[1];
  pLevel[1] = pDct[4];
  pLevel[2] = pDct[8];
  pLevel[3] = pDct[5];
  ST32 (&pLevel[4], LD32 (&pDct[2]));
  pLevel[6] = pDct[6];
  pLevel[7] = pDct[9];
  ST32 (&pLevel[8], LD32 (&pDct[12]));
  pLevel[10] = pDct[10];
  pLevel[11] = pDct[7];
  pLevel[12] = pDct[11];
  ST32 (&pLevel[13], LD32 (&pDct[14]));
  pLevel[15] = 0;
}

/*!
 * \brief   Zig-zag reorder all 16 coefficients; performs exactly the same
 *          scan as WelsScan4x4DcAc_c and therefore forwards to it.
 * \param   pLevel  receives the 16 reordered coefficients
 * \param   pDct    16 coefficients in row-major order
 */
void WelsScan4x4Dc (int16_t* pLevel, int16_t* pDct) {
  WelsScan4x4DcAc_c (pLevel, pDct);
}

//refer to JVT-O079
/*!
 * \brief   Score a 16-coefficient block from the zero runs between its
 *          non-zero coefficients.
 *
 *          Walking from index 15 down to 0, every non-zero coefficient adds
 *          kiTRunTable[run], where run is the number of consecutive zeros
 *          directly below it (at lower indices).
 *
 * \param   pDct  16 coefficients
 * \return  accumulated score; 0 when every coefficient is zero
 */
int32_t WelsCalculateSingleCtr4x4_c (int16_t* pDct) {
  static const int32_t kiTRunTable[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  int32_t iSingleCtr = 0;
  int32_t iIdx = 15;
  int32_t iRun;

  // skip trailing zeros to find the last non-zero coefficient
  while (iIdx >= 0 && pDct[iIdx] == 0) --iIdx;

  while (iIdx >= 0) {
    -- iIdx;       // step past the current non-zero coefficient
    iRun = iIdx;
    while (iIdx >= 0 && pDct[iIdx] == 0) --iIdx;
    iRun -= iIdx;  // zeros between this coefficient and the next non-zero one (or the start)
    iSingleCtr += kiTRunTable[iRun];
  }
  return iSingleCtr;
}

/****************************************************************************
 * Copy functions
 ****************************************************************************/

/*!
 * \brief   Copy a block 4 bytes wide and 4 rows high.
 *
 *          ST32/LD32 are project macros; presumably a 4-byte store/load.
 *
 * \param   pDst      destination
 * \param   iStrideD  destination row stride in bytes
 * \param   pSrc      source
 * \param   iStrideS  source row stride in bytes
 */
void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  const int32_t kiSrcStride2 = iStrideS << 1;
  const int32_t kiSrcStride3 = iStrideS + kiSrcStride2;
  const int32_t kiDstStride2 = iStrideD << 1;
  const int32_t kiDstStride3 = iStrideD + kiDstStride2;

  ST32 (pDst,                LD32 (pSrc));
  ST32 (pDst + iStrideD,     LD32 (pSrc + iStrideS));
  ST32 (pDst + kiDstStride2, LD32 (pSrc + kiSrcStride2));
  ST32 (pDst + kiDstStride3, LD32 (pSrc + kiSrcStride3));
}

/*!
 * \brief   Copy a block 8 bytes wide and 8 rows high, two rows per iteration.
 * \param   pDst      destination
 * \param   iStrideD  destination row stride in bytes
 * \param   pSrc      source
 * \param   iStrideS  source row stride in bytes
 */
void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  int32_t i;
  for (i = 0; i < 4; i++) {
    ST32 (pDst,                 LD32 (pSrc));
    ST32 (pDst + 4 ,            LD32 (pSrc + 4));
    ST32 (pDst + iStrideD,      LD32 (pSrc + iStrideS));
    ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4));
    pDst += iStrideD << 1;
    pSrc += iStrideS << 1;
  }
}

/*!
 * \brief   Copy a block 8 bytes wide and 16 rows high, two rows per iteration.
 * \param   pDst      destination
 * \param   iStrideD  destination row stride in bytes
 * \param   pSrc      source
 * \param   iStrideS  source row stride in bytes
 */
void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  int32_t i;
  for (i = 0; i < 8; ++i) {
    ST32 (pDst,                 LD32 (pSrc));
    ST32 (pDst + 4 ,            LD32 (pSrc + 4));
    ST32 (pDst + iStrideD,      LD32 (pSrc + iStrideS));
    ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4));
    pDst += iStrideD << 1;
    pSrc += iStrideS << 1;
  }
}

/*!
 * \brief   Copy a block 16 bytes wide and 8 rows high.
 * \param   pDst      destination
 * \param   iStrideD  destination row stride in bytes
 * \param   pSrc      source
 * \param   iStrideS  source row stride in bytes
 */
void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  int32_t i;
  for (i = 0; i < 8; i++) {
    ST32 (pDst,       LD32 (pSrc));
    ST32 (pDst + 4 ,  LD32 (pSrc + 4));
    ST32 (pDst + 8 ,  LD32 (pSrc + 8));
    ST32 (pDst + 12 , LD32 (pSrc + 12));
    pDst += iStrideD ;
    pSrc += iStrideS;
  }
}

/*!
 * \brief   Copy a block 16 bytes wide and 16 rows high.
 * \param   pDst      destination
 * \param   iStrideD  destination row stride in bytes
 * \param   pSrc      source
 * \param   iStrideS  source row stride in bytes
 */
void WelsCopy16x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  int32_t i;
  for (i = 0; i < 16; i++) {
    ST32 (pDst,       LD32 (pSrc));
    ST32 (pDst + 4 ,  LD32 (pSrc + 4));
    ST32 (pDst + 8 ,  LD32 (pSrc + 8));
    ST32 (pDst + 12 , LD32 (pSrc + 12));
    pDst += iStrideD ;
    pSrc += iStrideS;
  }
}

/*!
 * \brief   Count the non-zero entries among 16 levels.
 * \param   pLevel  16 values
 * \return  number of non-zero values (0..16)
 */
int32_t WelsGetNoneZeroCount_c (int16_t* pLevel) {
  int32_t iCnt = 0;
  int32_t iIdx = 0;

  // zeros are counted and subtracted from 16 at the end
  while (iIdx < 16) {
    iCnt += (pLevel[    iIdx] == 0);
    iCnt += (pLevel[1 + iIdx] == 0);
    iCnt += (pLevel[2 + iIdx] == 0);
    iCnt += (pLevel[3 + iIdx] == 0);

    iIdx += 4;
  }
  return (16 - iCnt);
}

#ifdef HAVE_NEON
/*!
 * \brief   NEON entry point matching WelsHadamardQuant2x2Skip_c: derives the
 *          same threshold and hands it to the NEON kernel.
 * \param   pRes  source buffer passed through to the kernel
 * \param   iFF   rounding offset
 * \param   iMF   multiplication factor; must not be 0 (used as a divisor)
 * \return  result of WelsHadamardQuant2x2SkipKernel_neon
 */
int32_t WelsHadamardQuant2x2Skip_neon(int16_t* pRes, int16_t iFF, int16_t iMF) {
  int16_t iThreshold = ((1<<16)-1)/iMF - iFF;
  return WelsHadamardQuant2x2SkipKernel_neon(pRes, iThreshold);
}
#endif

/*!
 * \brief   Populate the encoder function-pointer table.
 *
 *          Every entry is first set to its portable C implementation.  When
 *          built with X86_ASM or HAVE_NEON, entries are then overridden
 *          according to the capability bits in uiCpuFlag; later checks
 *          overwrite earlier ones (e.g. SSSE3 replaces the SSE2 pfScan4x4).
 *
 * \param   pFuncList  table to fill in
 * \param   uiCpuFlag  bit mask of WELS_CPU_* capability flags
 */
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) {
  pFuncList->pfCopy8x8Aligned           = WelsCopy8x8_c;
  pFuncList->pfCopy16x16Aligned         =
    pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16_c;
  pFuncList->pfCopy16x8NotAligned       = WelsCopy16x8_c;
  pFuncList->pfCopy8x16Aligned          = WelsCopy8x16_c;

  pFuncList->pfQuantizationHadamard2x2      = WelsHadamardQuant2x2_c;
  pFuncList->pfQuantizationHadamard2x2Skip  = WelsHadamardQuant2x2Skip_c;
  pFuncList->pfTransformHadamard4x4Dc       = WelsHadamardT4Dc_c;

  pFuncList->pfDctT4                    = WelsDctT4_c;
  pFuncList->pfDctFourT4                = WelsDctFourT4_c;

  pFuncList->pfScan4x4                  = WelsScan4x4DcAc_c;
  pFuncList->pfScan4x4Ac                = WelsScan4x4Ac_c;
  pFuncList->pfCalculateSingleCtr4x4    = WelsCalculateSingleCtr4x4_c;

  pFuncList->pfGetNoneZeroCount         = WelsGetNoneZeroCount_c;

  pFuncList->pfQuantization4x4          = WelsQuant4x4_c;
  pFuncList->pfQuantizationDc4x4        = WelsQuant4x4Dc_c;
  pFuncList->pfQuantizationFour4x4      = WelsQuantFour4x4_c;
  pFuncList->pfQuantizationFour4x4Max   = WelsQuantFour4x4Max_c;

#if defined(X86_ASM)
  if (uiCpuFlag & WELS_CPU_MMXEXT) {

    pFuncList->pfQuantizationHadamard2x2      = WelsHadamardQuant2x2_mmx;
    pFuncList->pfQuantizationHadamard2x2Skip  = WelsHadamardQuant2x2Skip_mmx;

    pFuncList->pfDctT4                  = WelsDctT4_mmx;

    pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmx;
    pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmx;
  }
  if (uiCpuFlag & WELS_CPU_SSE2) {
    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;

    pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
    pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_sse2;
    pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_sse2;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2;

    pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_sse2;
    pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_sse2;
    pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_sse2;

    pFuncList->pfScan4x4                = WelsScan4x4DcAc_sse2;
    pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_sse2;
    pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_sse2;

    pFuncList->pfDctFourT4              = WelsDctFourT4_sse2;
  }
//#ifndef MACOS
  if (uiCpuFlag & WELS_CPU_SSSE3) {
    pFuncList->pfScan4x4                = WelsScan4x4DcAc_ssse3;
  }

//#endif//MACOS

#endif//X86_ASM

#if defined(HAVE_NEON)
  if (uiCpuFlag & WELS_CPU_NEON) {
    pFuncList->pfQuantizationHadamard2x2      = WelsHadamardQuant2x2_neon;
    pFuncList->pfQuantizationHadamard2x2Skip  = WelsHadamardQuant2x2Skip_neon;
    pFuncList->pfDctT4                  = WelsDctT4_neon;
    pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_neon;
    pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_neon;

    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_neon;
    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_neon;

    pFuncList->pfQuantization4x4        = WelsQuant4x4_neon;
    pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_neon;
    pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_neon;
    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_neon;

    pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_neon;
    pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_neon;
    pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_neon;
    pFuncList->pfDctFourT4              = WelsDctFourT4_neon;
  }
#endif
}
}