ref: 8a0af4a3f212ee7fbe4cba44513e2f3115499ea4
parent: 7cbb75eac668fae4078fa93271e09d51d65a8990
author: Sindre Aamås <[email protected]>
date: Wed Jun 1 19:45:44 EDT 2016
[Processing/x86] DyadicBilinearDownsample optimizations. Average vertically before horizontally; horizontal averaging is more costly. Doing the vertical averaging first reduces the number of horizontal averages by half. Use pmaddubsw and pavgw to do the horizontal averaging for a slight performance improvement. Minor tweaks. Improve the SSSE3 dyadic downsample routines and drop the SSE4 routines. The non-temporal loads used in the SSE4 routines do nothing for cache-backed memory AFAIK. Adjust tests because averaging vertically first gives slightly different output. ~2.39x speedup for the widthx32 routine on Haswell when not memory-bound. ~2.20x speedup for the widthx16 routine on Haswell when not memory-bound. Note that the widthx16 routine can be unrolled for further speedup.
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -485,7 +485,7 @@
%endmacro
%macro WELS_EXTERN 1
- ALIGN 16
+ ALIGN 16, nop
%ifdef PREFIX
global _%1
%define %1 _%1
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -102,8 +102,6 @@
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
- sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse4;
- sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -94,10 +94,6 @@
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_ssse3;
// iSrcWidth= x32 pixels
HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -40,6 +40,10 @@
;*************************************************************************/
%include "asm_inc.asm"
+%ifdef __NASM_VER__
+ %use smartalign
+%endif
+
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
@@ -471,7 +475,6 @@
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
@@ -478,17 +481,6 @@
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
- ;push ebx
- ;push edx
- ;push esi
- ;push edi
- ;push ebp
-
- ;mov edi, [esp+24] ; pDst
- ;mov edx, [esp+28] ; iDstStride
- ;mov esi, [esp+32] ; pSrc
- ;mov ecx, [esp+36] ; iSrcStride
- ;mov ebp, [esp+44] ; iSrcHeight
%ifdef X86_32
push r6
%assign push_num 1
@@ -496,7 +488,7 @@
%assign push_num 0
%endif
LOAD_6_PARA
- PUSH_XMM 8
+ PUSH_XMM 4
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -508,15 +500,12 @@
%endif
sar r5, $01 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
+ WELS_DB1 xmm3
+ WELS_Zero xmm2
+ sar r4, $01 ; iSrcWidth >> 1
+ add r0, r4 ; pDst += iSrcWidth >> 1
.yloops4:
- ;mov eax, [esp+40] ; iSrcWidth
- ;sar eax, $01 ; iSrcWidth >> 1
- ;mov ebx, eax ; iDstWidth restored at ebx
- ;sar eax, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- ;neg ebx ; - (iSrcWidth >> 1)
%ifdef X86_32
mov r4, arg5
%else
@@ -523,81 +512,32 @@
mov r4, r12
%endif
sar r4, $01 ; iSrcWidth >> 1
- mov r6, r4 ; iDstWidth restored at ebx
- sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg r6 ; - (iSrcWidth >> 1)
+ neg r4 ; -(iSrcWidth >> 1)
+ mov r6, r4
+ align 16
; each loop = source bandwidth: 32 bytes
.xloops4:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
+ movdqa xmm0, [r2+r3]
+ movdqa xmm1, [r2+r3+16]
+ pavgb xmm0, [r2] ; avg vertical pixels 0-15
+ pavgb xmm1, [r2+16] ; avg vertical pixels 16-31
+ add r2, 32 ; pSrc += 32
+ pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels 0-15
+ pmaddubsw xmm1, xmm3 ; pairwise horizontal sum neighboring pixels 16-31
+ pavgw xmm0, xmm2 ; (sum + 1) >> 1
+ pavgw xmm1, xmm2 ; (sum + 1) >> 1
+ packuswb xmm0, xmm1 ; pack words to bytes
+ movdqa [r0+r4], xmm0 ; store results
+ add r4, 16
+ jl .xloops4
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [r2] ; 1st_src_line
- movdqa xmm1, [r2+16] ; 1st_src_line + 16
- movdqa xmm2, [r2+r3] ; 2nd_src_line
- movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [r0], xmm0
-
- ; next SMB
- lea r2, [r2+32]
- lea r0, [r0+16]
-
- dec r4
- jg near .xloops4
-
; next line
lea r2, [r2+2*r3] ; next end of lines
lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
lea r0, [r0+r1]
- lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
- dec r5
- jg near .yloops4
+ sub r5, 1
+ jg .yloops4
%ifndef X86_32
pop r12
@@ -623,7 +563,7 @@
%assign push_num 0
%endif
LOAD_6_PARA
- PUSH_XMM 6
+ PUSH_XMM 4
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -634,8 +574,11 @@
mov r12, r4
%endif
sar r5, $01 ; iSrcHeight >> 1
- movdqa xmm5, [shufb_mask_low] ; mask low
- movdqa xmm4, [shufb_mask_high] ; mask high
+ WELS_DB1 xmm3
+ WELS_Zero xmm2
+ add r2, r4 ; pSrc += iSrcWidth
+ sar r4, $01 ; iSrcWidth >> 1
+ add r0, r4 ; pDst += iSrcWidth >> 1
.yloops5:
%ifdef X86_32
@@ -644,279 +587,26 @@
mov r4, r12
%endif
sar r4, $01 ; iSrcWidth >> 1
- mov r6, r4 ; iDstWidth restored at ebx
- sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg r6 ; - (iSrcWidth >> 1)
+ neg r4 ; -(iSrcWidth >> 1)
+ lea r6, [r2+r3] ; pSrc + iSrcStride
+ align 16
; each loop = source bandwidth: 16 bytes
.xloops5:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
+ movdqa xmm0, [r2+2*r4]
+ pavgb xmm0, [r6+2*r4] ; avg vertical pixels
+ pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels
+ pavgw xmm0, xmm2 ; (sum + 1) >> 1
+ packuswb xmm0, xmm0 ; pack words to bytes
+ movlps [r0+r4], xmm0 ; store results
+ add r4, 8
+ jl .xloops5
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [r2] ; 1st_src_line
- movdqa xmm1, [r2+r3] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm5 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm4 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm5
- pshufb xmm3, xmm4
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [r0], xmm0
-
- ; next SMB
- lea r2, [r2+16]
- lea r0, [r0+8]
-
- dec r4
- jg near .xloops5
-
- lea r2, [r2+2*r3] ; next end of lines
- lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
- lea r0, [r0+r1]
- lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
-
- dec r5
- jg near .yloops5
-
-%ifndef X86_32
- pop r12
-%endif
-
- POP_XMM
- LOAD_6_PARA_POP
-%ifdef X86_32
- pop r6
-%endif
- ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-%ifdef X86_32
- push r6
- %assign push_num 1
-%else
- %assign push_num 0
-%endif
- LOAD_6_PARA
- PUSH_XMM 8
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
- push r12
- mov r12, r4
-%endif
- sar r5, $01 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops6:
-%ifdef X86_32
- mov r4, arg5
-%else
- mov r4, r12
-%endif
- sar r4, $01 ; iSrcWidth >> 1
- mov r6, r4 ; iDstWidth restored at ebx
- sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg r6 ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops6:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [r2] ; 1st_src_line
- movntdqa xmm1, [r2+16] ; 1st_src_line + 16
- movntdqa xmm2, [r2+r3] ; 2nd_src_line
- movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [r0], xmm0
-
- ; next SMB
- lea r2, [r2+32]
- lea r0, [r0+16]
-
- dec r4
- jg near .xloops6
-
- lea r2, [r2+2*r3] ; next end of lines
- lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
- lea r0, [r0+r1]
- lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
-
- dec r5
- jg near .yloops6
-
-%ifndef X86_32
- pop r12
-%endif
-
- POP_XMM
- LOAD_6_PARA_POP
-%ifdef X86_32
- pop r6
-%endif
- ret
-
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-%ifdef X86_32
- push r6
- %assign push_num 1
-%else
- %assign push_num 0
-%endif
- LOAD_6_PARA
- PUSH_XMM 6
- SIGN_EXTENSION r1, r1d
- SIGN_EXTENSION r3, r3d
- SIGN_EXTENSION r4, r4d
- SIGN_EXTENSION r5, r5d
-
-%ifndef X86_32
- push r12
- mov r12, r4
-%endif
- sar r5, $01 ; iSrcHeight >> 1
- movdqa xmm5, [shufb_mask_low] ; mask low
- movdqa xmm4, [shufb_mask_high] ; mask high
-
-.yloops7:
-%ifdef X86_32
- mov r4, arg5
-%else
- mov r4, r12
-%endif
- sar r4, $01 ; iSrcWidth >> 1
- mov r6, r4 ; iDstWidth restored at ebx
- sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg r6 ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops7:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [r2] ; 1st_src_line
- movntdqa xmm1, [r2+r3] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm5 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm4 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm5
- pshufb xmm3, xmm4
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [r0], xmm0
-
- ; next SMB
- lea r2, [r2+16]
- lea r0, [r0+8]
-
- dec r4
- jg near .xloops7
-
; next line
lea r2, [r2+2*r3] ; next end of lines
- lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth]
lea r0, [r0+r1]
- lea r0, [r0+r6] ; reset to base 0 [- iDstWidth]
- dec r5
- jg near .yloops7
+ sub r5, 1
+ jg .yloops5
%ifndef X86_32
pop r12
--- a/test/api/decode_api_test.cpp
+++ b/test/api/decode_api_test.cpp
@@ -759,9 +759,15 @@
const uint32_t kiFrameRate = 12; //DO NOT CHANGE!
const uint32_t kiFrameNum = 100; //DO NOT CHANGE!
const char* pHashStr[] = { //DO NOT CHANGE!
+#ifdef X86_ASM
+ "244eebcb51f4c2a56e83fc5da3373cad9ec0e1e5",
+ "bbad99ef99e37b34bcb4f09a7ec4d144375f6be7",
+ "809f97e836650624d92f0b8e200a6ab25f810d6f"
+#else
"9c4e6146b29bac5d5d4be3c5bbab9c072dcb3f3f",
"f350001c333902029800bd291fbed915a4bdf19a",
"eb9d853b7daec03052c4850027ac94adc84c3a7e"
+#endif
};
class DecodeParseAPI : public ::testing::TestWithParam<EncodeDecodeFileParamBase>, public EncodeDecodeTestBase {
--- a/test/api/encoder_test.cpp
+++ b/test/api/encoder_test.cpp
@@ -123,7 +123,12 @@
},
{
"res/CiscoVT2people_320x192_12fps.yuv",
- "73156dfc1dc45924349b5b79f8debcac13d7231d", CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
+#ifdef X86_ASM
+ "a5341d588b769809c1f1d983e5a0fcef7362f3ad",
+#else
+ "73156dfc1dc45924349b5b79f8debcac13d7231d",
+#endif
+ CAMERA_VIDEO_REAL_TIME, 320, 192, 12.0f, SM_SINGLE_SLICE, false, 2, false, false, false
},
{
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
@@ -131,7 +136,12 @@
},
{
"res/Cisco_Absolute_Power_1280x720_30fps.yuv",
- "3943145545a2bd27a642b2045d4e3dbae55c6870", CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
+#ifdef X86_ASM
+ "ec9d776a7d92cf0f6640065aee8af2450af0e993",
+#else
+ "3943145545a2bd27a642b2045d4e3dbae55c6870",
+#endif
+ CAMERA_VIDEO_REAL_TIME, 1280, 720, 30.0f, SM_SINGLE_SLICE, false, 4, false, false, false
},
// the following values may be adjusted for times since we start tuning the strategy
{
--- a/test/processing/ProcessUT_DownSample.cpp
+++ b/test/processing/ProcessUT_DownSample.cpp
@@ -30,6 +30,27 @@
}
}
+void DyadicBilinearDownsampler2_ref (uint8_t* pDst, const int32_t kiDstStride, // reference 2:1 downsampler: averages vertically first, then horizontally
+ const uint8_t* pSrc, const int32_t kiSrcStride,
+ const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+ uint8_t* pDstLine = pDst;
+ const uint8_t* pSrcLine1 = pSrc; // upper row of the current source row pair
+ const uint8_t* pSrcLine2 = pSrc + kiSrcStride; // lower row of the current source row pair
+ const int32_t kiDstWidth = kiSrcWidth >> 1; // output is half the source width
+ const int32_t kiDstHeight = kiSrcHeight >> 1; // output is half the source height
+
+ for (int32_t j = 0; j < kiDstHeight; j++) {
+ for (int32_t i = 0; i < kiDstWidth; i++) {
+ const int32_t kiTempCol1 = (pSrcLine1[2 * i + 0] + pSrcLine2[2 * i + 0] + 1) >> 1; // rounded vertical average of the left column
+ const int32_t kiTempCol2 = (pSrcLine1[2 * i + 1] + pSrcLine2[2 * i + 1] + 1) >> 1; // rounded vertical average of the right column
+ pDstLine[i] = (uint8_t) ((kiTempCol1 + kiTempCol2 + 1) >> 1); // rounded horizontal average of the two column averages
+ }
+ pDstLine += kiDstStride; // next output row
+ pSrcLine1 += 2 * kiSrcStride; // each output row consumes two source rows
+ pSrcLine2 += 2 * kiSrcStride;
+ }
+}
+
void GeneralBilinearFastDownsampler_ref (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
const int32_t kiDstHeight,
uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
@@ -162,7 +183,7 @@
}
}
-#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+#define GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, ref_func) \
TEST (DownSampleTest, func) { \
if (ASM) {\
int32_t iCpuCores = 0; \
@@ -190,7 +211,7 @@
dst_c[j] = dst_a[j] = rand() % 256; \
src_c[j] = src_a[j] = rand() % 256; \
} \
- DyadicBilinearDownsampler_ref (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
+ ref_func (dst_c, dst_stride_c, src_c, src_stride_c, src_width_c, src_height_c); \
func (dst_a, dst_stride_a, src_a, src_stride_a, src_width_a, src_height_a); \
for (int j = 0; j < (src_height_c >> 1); j++) { \
for (int m = 0; m < (src_width_c >> 1); m++) { \
@@ -199,6 +220,11 @@
} \
}
+#define GENERATE_DyadicBilinearDownsampler_UT(func, ASM, CPUFLAGS) \
+ GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler_ref)
+#define GENERATE_DyadicBilinearDownsampler2_UT(func, ASM, CPUFLAGS) \
+ GENERATE_DyadicBilinearDownsampler_UT_with_ref(func, ASM, CPUFLAGS, DyadicBilinearDownsampler2_ref)
+
#define GENERATE_DyadicBilinearOneThirdDownsampler_UT(func, ASM, CPUFLAGS) \
TEST (DownSampleTest, func) { \
if (ASM) {\
@@ -328,11 +354,8 @@
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse, 1, WELS_CPU_SSE)
GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx8_sse, 1, WELS_CPU_SSE)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
-
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx32_sse4, 1, WELS_CPU_SSE41)
-GENERATE_DyadicBilinearDownsampler_UT (DyadicBilinearDownsamplerWidthx16_sse4, 1, WELS_CPU_SSE41)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx32_ssse3, 1, WELS_CPU_SSSE3)
+GENERATE_DyadicBilinearDownsampler2_UT (DyadicBilinearDownsamplerWidthx16_ssse3, 1, WELS_CPU_SSSE3)
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_ssse3, 1, WELS_CPU_SSSE3)
GENERATE_DyadicBilinearOneThirdDownsampler_UT (DyadicBilinearOneThirdDownsampler_sse4, 1, WELS_CPU_SSE41)