ref: 063709c92e8bafe467d384f564afbdc6b66eb39a
dir: /codec/encoder/core/src/encode_mb_aux.cpp/
/*! * \copy * Copyright (c) 2013, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */ #include <string.h> #include "macros.h" #include "ls_defines.h" #include "encode_mb_aux.h" #include "cpu_core.h" #include "as264_common.h" #include "svc_encode_mb.h" namespace WelsSVCEnc { __align16( int16_t, g_kiQuantInterFF[58][8] )= { /* 0*/ { 0, 1, 0, 1, 1, 1, 1, 1 }, /* 1*/ { 0, 1, 0, 1, 1, 1, 1, 1 }, /* 2*/ { 1, 1, 1, 1, 1, 1, 1, 1 }, /* 3*/ { 1, 1, 1, 1, 1, 1, 1, 1 }, /* 4*/ { 1, 1, 1, 1, 1, 2, 1, 2 }, /* 5*/ { 1, 1, 1, 1, 1, 2, 1, 2 }, /* 6*/ { 1, 1, 1, 1, 1, 2, 1, 2 }, /* 7*/ { 1, 1, 1, 1, 1, 2, 1, 2 }, /* 8*/ { 1, 2, 1, 2, 2, 3, 2, 3 }, /* 9*/ { 1, 2, 1, 2, 2, 3, 2, 3 }, /*10*/ { 1, 2, 1, 2, 2, 3, 2, 3 }, /*11*/ { 1, 2, 1, 2, 2, 4, 2, 4 }, /*12*/ { 2, 3, 2, 3, 3, 4, 3, 4 }, /*13*/ { 2, 3, 2, 3, 3, 5, 3, 5 }, /*14*/ { 2, 3, 2, 3, 3, 5, 3, 5 }, /*15*/ { 2, 4, 2, 4, 4, 6, 4, 6 }, /*16*/ { 3, 4, 3, 4, 4, 7, 4, 7 }, /*17*/ { 3, 5, 3, 5, 5, 8, 5, 8 }, /*18*/ { 3, 5, 3, 5, 5, 8, 5, 8 }, /*19*/ { 4, 6, 4, 6, 6, 9, 6, 9 }, /*20*/ { 4, 7, 4, 7, 7, 10, 7, 10 }, /*21*/ { 5, 8, 5, 8, 8, 12, 8, 12 }, /*22*/ { 5, 8, 5, 8, 8, 13, 8, 13 }, /*23*/ { 6, 10, 6, 10, 10, 15, 10, 15 }, /*24*/ { 7, 11, 7, 11, 11, 17, 11, 17 }, /*25*/ { 7, 12, 7, 12, 12, 19, 12, 19 }, /*26*/ { 9, 13, 9, 13, 13, 21, 13, 21 }, /*27*/ { 9, 15, 9, 15, 15, 24, 15, 24 }, /*28*/ { 11, 17, 11, 17, 17, 26, 17, 26 }, /*29*/ { 12, 19, 12, 19, 19, 30, 19, 30 }, /*30*/ { 13, 22, 13, 22, 22, 33, 22, 33 }, /*31*/ { 15, 23, 15, 23, 23, 38, 23, 38 }, /*32*/ { 17, 27, 17, 27, 27, 42, 27, 42 }, /*33*/ { 19, 30, 19, 30, 30, 48, 30, 48 }, /*34*/ { 21, 33, 21, 33, 33, 52, 33, 52 }, /*35*/ { 24, 38, 24, 38, 38, 60, 38, 60 }, /*36*/ { 27, 43, 27, 43, 43, 67, 43, 67 }, /*37*/ { 29, 47, 29, 47, 47, 75, 47, 75 }, /*38*/ { 35, 53, 35, 53, 53, 83, 53, 83 }, /*39*/ { 37, 60, 37, 60, 60, 96, 60, 96 }, /*40*/ { 43, 67, 43, 67, 67, 104, 67, 104 }, /*41*/ { 48, 77, 48, 77, 77, 121, 77, 121 }, /*42*/ { 53, 87, 53, 87, 87, 133, 87, 133 }, /*43*/ { 59, 93, 59, 93, 93, 150, 93, 150 }, /*44*/ { 69, 107, 69, 107, 107, 167, 107, 167 }, /*45*/ { 75, 120, 75, 120, 120, 192, 120, 192 }, /*46*/ { 85, 133, 85, 133, 133, 208, 133, 208 }, /*47*/ { 96, 153, 96, 153, 153, 242, 153, 242 }, /*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 }, /*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 }, /*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 }, /*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 }, /* from here below is only for intra */ /*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 }, /*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 }, /*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 }, /*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 }, /*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 }, /*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 }, }; __align16( int16_t, g_kiQuantMF[52][8]) = { /* 0*/ {26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 }, /* 1*/ {23832, 14980, 23832, 14980, 14980, 9320, 14980, 9320 }, /* 2*/ {20164, 13108, 20164, 13108, 13108, 8388, 13108, 8388 }, /* 3*/ {18724, 11650, 18724, 11650, 11650, 7294, 11650, 7294 }, /* 4*/ {16384, 10486, 16384, 10486, 10486, 6710, 10486, 6710 }, /* 5*/ {14564, 9118, 14564, 9118, 9118, 5786, 9118, 5786 }, /* 6*/ {13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243 }, /* 7*/ {11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660 }, /* 8*/ {10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194 }, /* 9*/ { 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647 }, /*10*/ { 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355 }, /*11*/ { 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893 }, /*12*/ { 6554, 4033, 6554, 4033, 4033, 2622, 4033, 2622 }, /*13*/ { 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330 }, /*14*/ { 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097 }, /*15*/ { 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824 }, /*16*/ { 4096, 2622, 4096, 2622, 2622, 1678, 2622, 1678 }, /*17*/ { 3641, 2280, 3641, 2280, 2280, 1447, 2280, 1447 }, /*18*/ { 3277, 2017, 3277, 2017, 2017, 1311, 2017, 1311 }, /*19*/ { 2979, 1873, 2979, 1873, 1873, 1165, 1873, 1165 }, /*20*/ { 2521, 1639, 2521, 1639, 1639, 1049, 1639, 1049 }, /*21*/ { 2341, 1456, 2341, 1456, 1456, 912, 1456, 912 }, /*22*/ { 2048, 1311, 2048, 1311, 1311, 839, 1311, 839 }, /*23*/ { 1821, 1140, 1821, 1140, 1140, 723, 1140, 723 }, /*24*/ { 1638, 1008, 1638, 1008, 1008, 655, 1008, 655 }, /*25*/ { 1490, 936, 1490, 936, 936, 583, 936, 583 }, /*26*/ { 1260, 819, 1260, 819, 819, 524, 819, 524 }, /*27*/ { 1170, 728, 1170, 728, 728, 456, 728, 456 }, /*28*/ { 1024, 655, 1024, 655, 655, 419, 655, 419 }, /*29*/ { 910, 570, 910, 570, 570, 362, 570, 362 }, /*30*/ { 819, 504, 819, 504, 504, 328, 504, 328 }, /*31*/ { 745, 468, 745, 468, 468, 291, 468, 291 }, /*32*/ { 630, 410, 630, 410, 410, 262, 410, 262 }, /*33*/ { 585, 364, 585, 364, 364, 228, 364, 228 }, /*34*/ { 512, 328, 512, 328, 328, 210, 328, 210 }, /*35*/ { 455, 285, 455, 285, 285, 181, 285, 181 }, /*36*/ { 410, 252, 410, 252, 252, 164, 252, 164 }, /*37*/ { 372, 234, 372, 234, 234, 146, 234, 146 }, /*38*/ { 315, 205, 315, 205, 205, 131, 205, 131 }, /*39*/ { 293, 182, 293, 182, 182, 114, 182, 114 }, /*40*/ { 256, 164, 256, 164, 164, 105, 164, 105 }, /*41*/ { 228, 142, 228, 142, 142, 90, 142, 90 }, /*42*/ { 205, 126, 205, 126, 126, 82, 126, 82 }, /*43*/ { 186, 117, 186, 117, 117, 73, 117, 73 }, /*44*/ { 158, 102, 158, 102, 102, 66, 102, 66 }, /*45*/ { 146, 91, 146, 91, 91, 57, 91, 57 }, /*46*/ { 128, 82, 128, 82, 82, 52, 82, 52 }, /*47*/ { 114, 71, 114, 71, 71, 45, 71, 45 }, /*48*/ { 102, 63, 102, 63, 63, 41, 63, 41 }, /*49*/ { 93, 59, 93, 59, 59, 36, 59, 36 }, /*50*/ { 79, 51, 79, 51, 51, 33, 51, 33 }, /*51*/ { 73, 46, 73, 46, 46, 28, 46, 28 } }; /**************************************************************************** * HDM and Quant functions ****************************************************************************/ #define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign) #define NEW_QUANT(pDct, iFF, iMF) (((iFF)+ WELS_ABS_LC(pDct))*(iMF)) >>16 #define WELS_NEW_QUANT(pDct,iFF,iMF) WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF)) void WelsQuant4x4_c(int16_t *pDct, int16_t* pFF, int16_t *pMF) { int32_t i, j, iSign; for( i = 0; i < 16; i+=4 ) { j = i & 0x07; iSign = WELS_SIGN(pDct[i]); pDct[i] = WELS_NEW_QUANT(pDct[i], pFF[j], pMF[j]); iSign = WELS_SIGN(pDct[i+1]); pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], pFF[j+1], pMF[j+1]); iSign = WELS_SIGN(pDct[i+2]); pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], pFF[j+2], pMF[j+2]); iSign = WELS_SIGN(pDct[i+3]); pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], pFF[j+3], pMF[j+3]); } } void WelsQuant4x4Dc_c(int16_t *pDct, int16_t iFF, int16_t iMF) { int32_t i, iSign; for(i = 0; i < 16; i+=4) { iSign = WELS_SIGN(pDct[i]); pDct[i] = WELS_NEW_QUANT(pDct[i], iFF, iMF); iSign = WELS_SIGN(pDct[i+1]); pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], iFF, iMF); iSign = WELS_SIGN(pDct[i+2]); pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], iFF, iMF); iSign = WELS_SIGN(pDct[i+3]); pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], iFF, iMF); } } void WelsQuantFour4x4_c(int16_t *pDct, int16_t* pFF, int16_t *pMF) { int32_t i, j, iSign; for( i = 0; i < 64; i+=4 ) { j = i & 0x07; iSign = WELS_SIGN(pDct[i]); pDct[i] = WELS_NEW_QUANT(pDct[i], pFF[j], pMF[j]); iSign = WELS_SIGN(pDct[i+1]); pDct[i+1] = WELS_NEW_QUANT(pDct[i+1], pFF[j+1], pMF[j+1]); iSign = WELS_SIGN(pDct[i+2]); pDct[i+2] = WELS_NEW_QUANT(pDct[i+2], pFF[j+2], pMF[j+2]); iSign = WELS_SIGN(pDct[i+3]); pDct[i+3] = WELS_NEW_QUANT(pDct[i+3], pFF[j+3], pMF[j+3]); } } void WelsQuantFour4x4Max_c(int16_t *pDct, int16_t* pFF, int16_t *pMF, int16_t *pMax) { int32_t i, j, k, iSign; int16_t iMaxAbs; for( k = 0; k < 4; k++) { iMaxAbs = 0; for( i = 0; i < 16; i++ ) { j = i & 0x07; iSign = WELS_SIGN(pDct[i]); pDct[i] = NEW_QUANT(pDct[i], pFF[j], pMF[j]); if( iMaxAbs < pDct[i]) iMaxAbs = pDct[i]; pDct[i] = WELS_ABS_LC(pDct[i]); } pDct += 16; pMax[k] = iMaxAbs; } } int32_t WelsHadamardQuant2x2Skip_c(int16_t *pRs, int16_t iFF, int16_t iMF) { int16_t pDct[4], s[4]; int16_t iThreshold = ((1<<16)-1)/iMF - iFF; s[0] = pRs[0] + pRs[32]; s[1] = pRs[0] - pRs[32]; s[2] = pRs[16] + pRs[48]; s[3] = pRs[16] - pRs[48]; pDct[0] = s[0] + s[2]; pDct[1] = s[0] - s[2]; pDct[2] = s[1] + s[3]; pDct[3] = s[1] - s[3]; return ((WELS_ABS(pDct[0]) > iThreshold) || (WELS_ABS(pDct[1]) > iThreshold) || (WELS_ABS(pDct[2]) > iThreshold) || (WELS_ABS(pDct[3]) > iThreshold)); } int32_t WelsHadamardQuant2x2_c(int16_t *pRs, const int16_t iFF, int16_t iMF, int16_t * pDct, int16_t * pBlock) { int16_t s[4]; int32_t iSign, i, iDcNzc = 0; s[0] = pRs[0] + pRs[32]; s[1] = pRs[0] - pRs[32]; s[2] = pRs[16] + pRs[48]; s[3] = pRs[16] - pRs[48]; pRs[0] = 0; pRs[16] = 0; pRs[32] = 0; pRs[48] = 0; pDct[0] = s[0] + s[2]; pDct[1] = s[0] - s[2]; pDct[2] = s[1] + s[3]; pDct[3] = s[1] - s[3]; iSign = WELS_SIGN(pDct[0]); pDct[0] = WELS_NEW_QUANT(pDct[0], iFF, iMF); iSign = WELS_SIGN(pDct[1]); pDct[1] = WELS_NEW_QUANT(pDct[1], iFF, iMF); iSign = WELS_SIGN(pDct[2]); pDct[2] = WELS_NEW_QUANT(pDct[2], iFF, iMF); iSign = WELS_SIGN(pDct[3]); pDct[3] = WELS_NEW_QUANT(pDct[3], iFF, iMF); ST64( pBlock, LD64(pDct) ); for(i=0; i<4; i++) iDcNzc += (pBlock[i] != 0); return iDcNzc; } /* dc value pick up and hdm_4x4 */ void WelsHadamardT4Dc_c( int16_t *pLumaDc, int16_t *pDct) { int32_t p[16], s[4]; int32_t i, iIdx; for(i = 0 ; i < 16 ; i +=4) { iIdx = ((i&0x08) << 4) +((i&0x04) << 3); s[0] = pDct[iIdx ] + pDct[iIdx+80]; s[3] = pDct[iIdx ] - pDct[iIdx+80]; s[1] = pDct[iIdx+16] + pDct[iIdx+64]; s[2] = pDct[iIdx+16] - pDct[iIdx+64]; p[i ] = s[0] + s[1]; p[i+2] = s[0] - s[1]; p[i+1] = s[3] + s[2]; p[i+3] = s[3] - s[2]; } for(i = 0 ; i < 4 ; i ++) { s[0] = p[i ] + p[i+12]; s[3] = p[i ] - p[i+12]; s[1] = p[i+4] + p[i+ 8]; s[2] = p[i+4] - p[i+ 8]; pLumaDc[i ] = WELS_CLIP3((s[0] + s[1] + 1) >> 1, -32768, 32767); pLumaDc[i+8 ] = WELS_CLIP3((s[0] - s[1] + 1) >> 1, -32768, 32767); pLumaDc[i+4 ] = WELS_CLIP3((s[3] + s[2] + 1) >> 1, -32768, 32767); pLumaDc[i+12] = WELS_CLIP3((s[3] - s[2] + 1) >> 1, -32768, 32767); } } /**************************************************************************** * DCT functions ****************************************************************************/ void WelsDctT4_c( int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 ) { int16_t i, pData[16], s[4]; for(i = 0 ; i < 16 ; i +=4) { const int32_t kiI1= 1 + i; const int32_t kiI2= 2 + i; const int32_t kiI3= 3 + i; pData[i ] = pPixel1[0] - pPixel2[0]; pData[kiI1] = pPixel1[1] - pPixel2[1]; pData[kiI2] = pPixel1[2] - pPixel2[2]; pData[kiI3] = pPixel1[3] - pPixel2[3]; pPixel1 += iStride1; pPixel2 += iStride2; /*horizontal transform */ s[0] = pData[i] + pData[kiI3]; s[3] = pData[i] - pData[kiI3]; s[1] = pData[kiI1] + pData[kiI2]; s[2] = pData[kiI1] - pData[kiI2]; pDct[i ] = s[0] + s[1]; pDct[kiI2] = s[0] - s[1]; pDct[kiI1] = (s[3] << 1) + s[2]; pDct[kiI3] = s[3] - (s[2] << 1); } /* vertical transform */ for(i = 0 ; i < 4 ; i ++) { const int32_t kiI4 = 4 + i; const int32_t kiI8 = 8 + i; const int32_t kiI12 = 12 + i; s[0] = pDct[i ] + pDct[kiI12]; s[3] = pDct[i ] - pDct[kiI12]; s[1] = pDct[kiI4] + pDct[kiI8 ]; s[2] = pDct[kiI4] - pDct[kiI8 ]; pDct[i ] = s[0] + s[1]; pDct[kiI8 ] = s[0] - s[1]; pDct[kiI4 ] = (s[3] << 1) + s[2]; pDct[kiI12] = s[3] - (s[2] << 1); } } void WelsDctFourT4_c(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 ) { int32_t stride_1 = iStride1 << 2; int32_t stride_2 = iStride2 << 2; WelsDctT4_c( pDct, &pPixel1[0], iStride1, &pPixel2[0], iStride2 ); WelsDctT4_c( pDct + 16, &pPixel1[4], iStride1, &pPixel2[4], iStride2 ); WelsDctT4_c( pDct + 32, &pPixel1[stride_1 ], iStride1, &pPixel2[stride_2 ], iStride2 ); WelsDctT4_c( pDct + 48, &pPixel1[stride_1+4], iStride1, &pPixel2[stride_2+4], iStride2 ); } /**************************************************************************** * Scan and Score functions ****************************************************************************/ void WelsScan4x4DcAc_c( int16_t* pLevel, int16_t *pDct ) { ST32( pLevel, LD32(pDct) ); pLevel[2] = pDct[4]; pLevel[3] = pDct[8]; pLevel[4] = pDct[5]; ST32( pLevel+5, LD32(pDct+2) ); pLevel[7] = pDct[6]; pLevel[8] = pDct[9]; ST32( pLevel+9, LD32(pDct+12) ); pLevel[11] = pDct[10]; pLevel[12] = pDct[7]; pLevel[13] = pDct[11]; ST32( pLevel+14, LD32(pDct+14) ); } void WelsScan4x4Ac_c( int16_t* pLevel, int16_t* pDct ) { pLevel[0] = pDct[1]; pLevel[1] = pDct[4]; pLevel[2] = pDct[8]; pLevel[3] = pDct[5]; ST32( &pLevel[4], LD32(&pDct[2]) ); pLevel[6] = pDct[6]; pLevel[7] = pDct[9]; ST32( &pLevel[8], LD32(&pDct[12]) ); pLevel[10] = pDct[10]; pLevel[11] = pDct[7]; pLevel[12] = pDct[11]; ST32( &pLevel[13], LD32(&pDct[14]) ); pLevel[15] = 0; } void WelsScan4x4Dc( int16_t* pLevel, int16_t* pDct ) { ST32( pLevel, LD32(pDct) ); pLevel[2] = pDct[4]; pLevel[3] = pDct[8]; pLevel[4] = pDct[5]; ST32( pLevel+5, LD32(pDct+2) ); pLevel[7] = pDct[6]; pLevel[8] = pDct[9]; ST32( pLevel+9, LD32(pDct+12) ); pLevel[11] = pDct[10]; pLevel[12] = pDct[7]; pLevel[13] = pDct[11]; ST32( pLevel+14, LD32(pDct+14) ); } //refer to JVT-O079 int32_t WelsCalculateSingleCtr4x4_c( int16_t *pDct) { static const int32_t kiTRunTable[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int32_t iSingleCtr = 0; int32_t iIdx = 15; int32_t iRun; while( iIdx >= 0 && pDct[iIdx] == 0 ) --iIdx; while( iIdx >= 0 ) { -- iIdx; iRun = iIdx; while( iIdx >= 0 && pDct[iIdx] == 0 ) --iIdx; iRun -= iIdx; iSingleCtr += kiTRunTable[iRun]; } return iSingleCtr; } /**************************************************************************** * Copy functions ****************************************************************************/ void WelsCopy4x4( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ) { const int32_t kiSrcStride2 = iStrideS << 1; const int32_t kiSrcStride3 = iStrideS + kiSrcStride2; const int32_t kiDstStride2 = iStrideD << 1; const int32_t kiDstStride3 = iStrideD + kiDstStride2; ST32( pDst, LD32(pSrc) ); ST32( pDst+iStrideD, LD32(pSrc+iStrideS) ); ST32( pDst+kiDstStride2, LD32(pSrc+kiSrcStride2) ); ST32( pDst+kiDstStride3, LD32(pSrc+kiSrcStride3) ); } void WelsCopy8x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ) { int32_t i; for( i = 0; i < 4; i++) { ST32( pDst, LD32(pSrc )); ST32( pDst + 4 , LD32(pSrc + 4 )); ST32( pDst + iStrideD, LD32(pSrc + iStrideS)); ST32( pDst + iStrideD + 4 , LD32(pSrc + iStrideS + 4)); pDst += iStrideD << 1; pSrc += iStrideS << 1; } } void WelsCopy8x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ) { int32_t i; for( i = 0; i < 8; ++i ) { ST32( pDst, LD32(pSrc )); ST32( pDst + 4 , LD32(pSrc + 4 )); ST32( pDst + iStrideD, LD32(pSrc + iStrideS)); ST32( pDst + iStrideD + 4 , LD32(pSrc + iStrideS + 4)); pDst += iStrideD << 1; pSrc += iStrideS << 1; } } void WelsCopy16x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ) { int32_t i; for( i = 0; i < 8; i++) { ST32( pDst, LD32(pSrc )); ST32( pDst + 4 , LD32(pSrc + 4 )); ST32( pDst + 8 ,LD32(pSrc + 8 )); ST32( pDst + 12 , LD32(pSrc + 12)); pDst += iStrideD ; pSrc += iStrideS; } } void WelsCopy16x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS ) { int32_t i; for( i = 0; i < 16; i++) { ST32( pDst, LD32(pSrc )); ST32( pDst + 4 , LD32(pSrc + 4 )); ST32( pDst + 8 ,LD32(pSrc + 8 )); ST32( pDst + 12 , LD32(pSrc + 12)); pDst += iStrideD ; pSrc += iStrideS; } } int32_t WelsGetNoneZeroCount_c(int16_t * pLevel) { int32_t iCnt = 0; int32_t iIdx = 0; while (iIdx < 16) { iCnt += (pLevel[ iIdx] == 0); iCnt += (pLevel[1+iIdx] == 0); iCnt += (pLevel[2+iIdx] == 0); iCnt += (pLevel[3+iIdx] == 0); iIdx += 4; } return (16 - iCnt); } void WelsInitEncodingFuncs( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag ) { pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c; pFuncList->pfCopy16x16Aligned = pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c; pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_c; pFuncList->pfCopy8x16Aligned = WelsCopy8x16_c; pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_c; pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c; pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_c; pFuncList->pfDctT4 = WelsDctT4_c; pFuncList->pfDctFourT4 = WelsDctFourT4_c; pFuncList->pfScan4x4 = WelsScan4x4DcAc_c; pFuncList->pfScan4x4Ac = WelsScan4x4Ac_c; pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_c; pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_c; pFuncList->pfQuantization4x4 = WelsQuant4x4_c; pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_c; pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_c; pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_c; #if defined(X86_ASM) if ( uiCpuFlag & WELS_CPU_MMXEXT ) { pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_mmx; pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_mmx; pFuncList->pfDctT4 = WelsDctT4_mmx; pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmx; pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmx; } if ( uiCpuFlag & WELS_CPU_SSE2 ) { pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_sse2; pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2; pFuncList->pfQuantization4x4 = WelsQuant4x4_sse2; pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_sse2; pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_sse2; pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_sse2; pFuncList->pfCopy16x16Aligned = WelsCopy16x16_sse2; pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_sse2; pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_sse2; pFuncList->pfScan4x4 = WelsScan4x4DcAc_sse2; pFuncList->pfScan4x4Ac = WelsScan4x4Ac_sse2; pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_sse2; pFuncList->pfDctFourT4 = WelsDctFourT4_sse2; } //#ifndef MACOS if ( uiCpuFlag & WELS_CPU_SSSE3 ) { pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3; } //#endif//MACOS #endif//X86_ASM } }