shithub: openh264

ref: 4c829a12e2857b6e6346d83ef0d1eba1513ccf55
dir: /codec/processing/src/asm/downsample_bilinear.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*	downsample_bilinear.asm
;*
;*  Abstract
;*		SIMD for pixel domain down sampling
;*
;*  History
;*		10/22/2009	Created
;*
;*************************************************************************/
%include "asm_inc.asm"
%ifdef X86_32
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************


;***********************************************************************
; Some constants
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************

ALIGN 16
; pshufb control vectors used by the SSSE3 / SSE4 dyadic downsamplers.
; A control byte of 80h (bit 7 set) zeroes the destination byte; any other
; value selects the source byte with that index.
;	shufb_mask_low : source bytes 0,2,4,..,14 -> low byte of each word, high byte 0
;	shufb_mask_high: source bytes 1,3,5,..,15 -> low byte of each word, high byte 0
shufb_mask_low:
	db 0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
	db 0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80
shufb_mask_high:
	db 0x01, 0x80, 0x03, 0x80, 0x05, 0x80, 0x07, 0x80
	db 0x09, 0x80, 0x0b, 0x80, 0x0d, 0x80, 0x0f, 0x80


;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
;					unsigned char* pSrc, const int iSrcStride,
;					const int iSrcWidth, const int iSrcHeight );
;
;	Halves the picture horizontally and vertically.  Every destination
;	byte is pavgb( pavgb(tl, tr), pavgb(bl, br) ) of one 2x2 source quad,
;	i.e. two cascaded (x+y+1)>>1 roundings.
;	One inner iteration reads 32 bytes from each of two source rows and
;	stores 16 destination bytes.  Both loops test their counter after the
;	body, so the body always runs at least once.
;	Arguments are read from the stack; MMX registers are used and
;	WELSEMMS is issued before returning.
;***********************************************************************

; X32SSE_SPLIT_EVEN_ODD dst, src, tmp
;	src holds 8 bytes b0..b7 (b0 = lowest address).  On exit the low dword
;	of dst holds b0,b2,b4,b6 and the high dword holds b1,b3,b5,b7.
;	tmp is clobbered, src is left unchanged.
%macro X32SSE_SPLIT_EVEN_ODD 3
	pshufw		%1, %2, 0d8h		; d D b B c C a A  (middle words swapped)
	pshufw		%3, %1, 04eh		; c C a A d D b B  (dwords swapped)
	punpcklbw	%1, %3				; d c D C b a B A
	pshufw		%1, %1, 0d8h		; d c b a D C B A
%endmacro

; X32SSE_AVG_2X2 out, up0, up1, tmp, lo0, lo1
;	up0/up1: split qwords (see above) of 16 consecutive upper-row bytes,
;	lo0/lo1: the same for the lower row.
;	out receives the 8 averaged bytes; up0, tmp and lo0 are clobbered.
%macro X32SSE_AVG_2X2 6
	movq		%1, %2
	punpckldq	%1, %3				; even-position bytes, upper row
	punpckhdq	%2, %3				; odd-position bytes, upper row
	movq		%4, %5
	punpckldq	%4, %6				; even-position bytes, lower row
	punpckhdq	%5, %6				; odd-position bytes, lower row
	pavgb		%1, %2				; horizontal average, upper row
	pavgb		%4, %5				; horizontal average, lower row
	pavgb		%1, %4				; vertical average of the two rows
%endmacro

WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 4					; eax = 16-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk32:
	; ---- source bytes 0..15 of this chunk -> mm0 ----
	movq	mm0, [esi]				; upper row, bytes 0..7
	movq	mm1, [esi+8]			; upper row, bytes 8..15
	movq	mm2, [esi+ecx]			; lower row, bytes 0..7
	movq	mm3, [esi+ecx+8]		; lower row, bytes 8..15

	X32SSE_SPLIT_EVEN_ODD	mm4, mm0, mm5
	X32SSE_SPLIT_EVEN_ODD	mm5, mm1, mm6
	X32SSE_SPLIT_EVEN_ODD	mm6, mm2, mm7
	X32SSE_SPLIT_EVEN_ODD	mm7, mm3, mm0

	X32SSE_AVG_2X2			mm0, mm4, mm5, mm1, mm6, mm7

	; ---- source bytes 16..31 of this chunk -> mm2 (mm0 stays live) ----
	movq	mm1, [esi+16]			; upper row, bytes 16..23
	movq	mm2, [esi+24]			; upper row, bytes 24..31
	movq	mm3, [esi+ecx+16]		; lower row, bytes 16..23
	movq	mm4, [esi+ecx+24]		; lower row, bytes 24..31

	X32SSE_SPLIT_EVEN_ODD	mm5, mm1, mm6
	X32SSE_SPLIT_EVEN_ODD	mm6, mm2, mm7
	X32SSE_SPLIT_EVEN_ODD	mm7, mm3, mm1
	X32SSE_SPLIT_EVEN_ODD	mm1, mm4, mm2

	X32SSE_AVG_2X2			mm2, mm5, mm6, mm3, mm7, mm1

	; store the 16 result bytes
	movq	[edi  ], mm0
	movq	[edi+8], mm2

	; advance to the next chunk
	lea		esi, [esi+32]
	lea		edi, [edi+16]

	dec		eax
	jg		near .chunk32

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	WELSEMMS
	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret

;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
;					  unsigned char* pSrc, const int iSrcStride,
;					  const int iSrcWidth, const int iSrcHeight );
;
;	2:1 downsampling in both directions, 16 source bytes (two rows) per
;	inner iteration -> 8 destination bytes.  Each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad.
;	Both loops test their counter after the body.  Arguments are read
;	from the stack; MMX registers are used and WELSEMMS is issued before
;	returning.
;***********************************************************************

; X16SSE_SPLIT_EVEN_ODD dst, src, tmp
;	src holds 8 bytes b0..b7 (b0 = lowest address).  On exit the low dword
;	of dst holds b0,b2,b4,b6 and the high dword holds b1,b3,b5,b7.
;	tmp is clobbered, src is left unchanged.
%macro X16SSE_SPLIT_EVEN_ODD 3
	pshufw		%1, %2, 0d8h		; d D b B c C a A  (middle words swapped)
	pshufw		%3, %1, 04eh		; c C a A d D b B  (dwords swapped)
	punpcklbw	%1, %3				; d c D C b a B A
	pshufw		%1, %1, 0d8h		; d c b a D C B A
%endmacro

WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 3					; eax = 8-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk16:
	movq	mm0, [esi]				; upper row, bytes 0..7
	movq	mm1, [esi+8]			; upper row, bytes 8..15
	movq	mm2, [esi+ecx]			; lower row, bytes 0..7
	movq	mm3, [esi+ecx+8]		; lower row, bytes 8..15

	X16SSE_SPLIT_EVEN_ODD	mm4, mm0, mm5
	X16SSE_SPLIT_EVEN_ODD	mm5, mm1, mm6
	X16SSE_SPLIT_EVEN_ODD	mm6, mm2, mm7
	X16SSE_SPLIT_EVEN_ODD	mm7, mm3, mm0

	; gather even / odd columns of each row into full qwords
	movq		mm0, mm4
	punpckldq	mm0, mm5			; upper row, even-position bytes
	punpckhdq	mm4, mm5			; upper row, odd-position bytes

	movq		mm1, mm6
	punpckldq	mm1, mm7			; lower row, even-position bytes
	punpckhdq	mm6, mm7			; lower row, odd-position bytes

	pavgb		mm0, mm4			; horizontal average, upper row
	pavgb		mm1, mm6			; horizontal average, lower row
	pavgb		mm0, mm1			; vertical average

	movq	[edi  ], mm0			; 8 destination bytes

	; advance to the next chunk
	lea		esi, [esi+16]
	lea		edi, [edi+8]

	dec		eax
	jg		near .chunk16

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	WELSEMMS
	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret

;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
;					  unsigned char* pSrc, const int iSrcStride,
;					  const int iSrcWidth, const int iSrcHeight );
;
;	2:1 downsampling in both directions, 8 source bytes (two rows) per
;	inner iteration -> 4 destination bytes.  Each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad.
;	Both loops test their counter after the body.  Arguments are read
;	from the stack; MMX registers are used and WELSEMMS is issued before
;	returning.
;***********************************************************************

; X8SSE_SPLIT_EVEN_ODD dst, src, tmp
;	src holds 8 bytes b0..b7 (b0 = lowest address).  On exit the low dword
;	of dst holds b0,b2,b4,b6 and the high dword holds b1,b3,b5,b7.
;	tmp is clobbered, src is left unchanged.
%macro X8SSE_SPLIT_EVEN_ODD 3
	pshufw		%1, %2, 0d8h		; d D b B c C a A  (middle words swapped)
	pshufw		%3, %1, 04eh		; c C a A d D b B  (dwords swapped)
	punpcklbw	%1, %3				; d c D C b a B A
	pshufw		%1, %1, 0d8h		; d c b a D C B A
%endmacro

WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 2					; eax = 4-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk8:
	;               mem  hi<-       ->lo
	; upper row:	mm0: d D c C b B a A
	; lower row:	mm1: h H g G f F e E
	movq	mm0, [esi]
	movq	mm1, [esi+ecx]

	X8SSE_SPLIT_EVEN_ODD	mm2, mm0, mm3	; mm2 = d c b a D C B A
	X8SSE_SPLIT_EVEN_ODD	mm4, mm1, mm5	; mm4 = h g f e H G F E

	movq		mm0, mm2
	punpckldq	mm0, mm4			; H G F E D C B A  (even-position bytes, lower:upper)
	punpckhdq	mm2, mm4			; h g f e d c b a  (odd-position bytes,  lower:upper)

	pavgb		mm0, mm2			; low dword: upper-row averages, high dword: lower-row averages
	pshufw		mm1, mm0, 04eh		; same value with the dwords swapped
	pavgb		mm0, mm1			; vertical average (both dwords hold the result)

	movd	[edi], mm0				; 4 destination bytes

	; advance to the next chunk
	lea		esi, [esi+8]
	lea		edi, [edi+4]

	dec		eax
	jg		near .chunk8

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	WELSEMMS
	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret



; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
;					unsigned char* pSrc, const int iSrcStride,
;					const int iSrcWidth, const int iSrcHeight );
;
;	2:1 downsampling in both directions, 32 source bytes (two rows) per
;	inner iteration -> 16 destination bytes.  Each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad.
;	All loads and the store use movdqa, so the addressed source rows and
;	the destination must be 16-byte aligned.  Both loops test their
;	counter after the body.  Arguments are read from the stack.
;***********************************************************************

; X32SSSE3_HAVG reg, tmp
;	reg holds 16 source bytes.  On exit each of its 8 words holds the
;	pavgb of one horizontally adjacent byte pair (high byte of the word 0).
;	Expects xmm7 = shufb_mask_low, xmm6 = shufb_mask_high; tmp is clobbered.
%macro X32SSSE3_HAVG 2
	movdqa	%2, %1
	pshufb	%1, xmm7				; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
	pshufb	%2, xmm6				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
	pavgb	%1, %2
%endmacro

WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows

	movdqa	xmm7, [shufb_mask_low]	; selects even-position bytes
	movdqa	xmm6, [shufb_mask_high]	; selects odd-position bytes

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 4					; eax = 16-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk32:
	movdqa	xmm0, [esi]				; upper row, bytes 0..15
	movdqa	xmm1, [esi+16]			; upper row, bytes 16..31
	movdqa	xmm2, [esi+ecx]			; lower row, bytes 0..15
	movdqa	xmm3, [esi+ecx+16]		; lower row, bytes 16..31

	; horizontal pair averages, one word per result
	X32SSSE3_HAVG	xmm0, xmm4
	X32SSSE3_HAVG	xmm1, xmm5
	X32SSSE3_HAVG	xmm2, xmm4
	X32SSSE3_HAVG	xmm3, xmm5

	packuswb	xmm0, xmm1			; 16 horizontal averages, upper row
	packuswb	xmm2, xmm3			; 16 horizontal averages, lower row
	pavgb		xmm0, xmm2			; vertical average

	movdqa	[edi], xmm0				; 16 destination bytes

	; advance to the next chunk
	lea		esi, [esi+32]
	lea		edi, [edi+16]

	dec		eax
	jg		near .chunk32

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret

;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
;					  unsigned char* pSrc, const int iSrcStride,
;					  const int iSrcWidth, const int iSrcHeight );
;
;	2:1 downsampling in both directions, 16 source bytes (two rows) per
;	inner iteration -> 8 destination bytes.  Each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad.
;	The loads use movdqa, so the addressed source rows must be 16-byte
;	aligned; the 8-byte store uses movq.  Both loops test their counter
;	after the body.  Arguments are read from the stack.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows
	movdqa	xmm7, [shufb_mask_low]	; selects even-position bytes
	movdqa	xmm6, [shufb_mask_high]	; selects odd-position bytes

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 3					; eax = 8-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk16:
	movdqa	xmm0, [esi]				; upper row, 16 bytes
	movdqa	xmm1, [esi+ecx]			; lower row, 16 bytes

	; upper row: even / odd bytes widened to words, then averaged
	movdqa	xmm2, xmm0
	pshufb	xmm0, xmm7				; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
	pshufb	xmm2, xmm6				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
	pavgb	xmm0, xmm2

	; lower row: same treatment
	movdqa	xmm3, xmm1
	pshufb	xmm1, xmm7
	pshufb	xmm3, xmm6
	pavgb	xmm1, xmm3

	pavgb		xmm0, xmm1			; vertical average, still one result per word
	packuswb	xmm0, xmm1			; low 8 bytes = the 8 results (upper 8 bytes are not stored)

	movq	[edi], xmm0				; 8 destination bytes

	; advance to the next chunk
	lea		esi, [esi+16]
	lea		edi, [edi+8]

	dec		eax
	jg		near .chunk16

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret

; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
;					unsigned char* pSrc, const int iSrcStride,
;					const int iSrcWidth, const int iSrcHeight );
;
;	Same arithmetic as the SSSE3 x32 variant (each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad), but the
;	source is fetched with movntdqa.  movntdqa and the movdqa store both
;	require 16-byte aligned addresses.  32 source bytes (two rows) per
;	inner iteration -> 16 destination bytes.  Both loops test their
;	counter after the body.  Arguments are read from the stack.
;***********************************************************************

; X32SSE4_HAVG reg, tmp
;	reg holds 16 source bytes.  On exit each of its 8 words holds the
;	pavgb of one horizontally adjacent byte pair (high byte of the word 0).
;	Expects xmm7 = shufb_mask_low, xmm6 = shufb_mask_high; tmp is clobbered.
%macro X32SSE4_HAVG 2
	movdqa	%2, %1
	pshufb	%1, xmm7				; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
	pshufb	%2, xmm6				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
	pavgb	%1, %2
%endmacro

WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows

	movdqa	xmm7, [shufb_mask_low]	; selects even-position bytes
	movdqa	xmm6, [shufb_mask_high]	; selects odd-position bytes

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 4					; eax = 16-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk32:
	movntdqa	xmm0, [esi]			; upper row, bytes 0..15
	movntdqa	xmm1, [esi+16]		; upper row, bytes 16..31
	movntdqa	xmm2, [esi+ecx]		; lower row, bytes 0..15
	movntdqa	xmm3, [esi+ecx+16]	; lower row, bytes 16..31

	; horizontal pair averages, one word per result
	X32SSE4_HAVG	xmm0, xmm4
	X32SSE4_HAVG	xmm1, xmm5
	X32SSE4_HAVG	xmm2, xmm4
	X32SSE4_HAVG	xmm3, xmm5

	packuswb	xmm0, xmm1			; 16 horizontal averages, upper row
	packuswb	xmm2, xmm3			; 16 horizontal averages, lower row
	pavgb		xmm0, xmm2			; vertical average

	movdqa	[edi], xmm0				; 16 destination bytes

	; advance to the next chunk
	lea		esi, [esi+32]
	lea		edi, [edi+16]

	dec		eax
	jg		near .chunk32

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret

;***********************************************************************
;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
;					  unsigned char* pSrc, const int iSrcStride,
;					  const int iSrcWidth, const int iSrcHeight );
;
;	Same arithmetic as the SSSE3 x16 variant (each result byte is
;	pavgb( pavgb(tl, tr), pavgb(bl, br) ) of a 2x2 source quad), but the
;	source is fetched with movntdqa, which requires 16-byte aligned
;	addresses; the 8-byte store uses movq.  16 source bytes (two rows)
;	per inner iteration -> 8 destination bytes.  Both loops test their
;	counter after the body.  Arguments are read from the stack.
;***********************************************************************
WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
	push	ebx
	push	edx
	push	esi
	push	edi
	push	ebp

	; five saved registers + return address: first argument is at [esp+24]
	mov		edi, [esp+24]			; edi = pDst
	mov		edx, [esp+28]			; edx = iDstStride
	mov		esi, [esp+32]			; esi = pSrc
	mov		ecx, [esp+36]			; ecx = iSrcStride
	mov		ebp, [esp+44]			; ebp = iSrcHeight

	sar		ebp, 1					; ebp = number of destination rows
	movdqa	xmm7, [shufb_mask_low]	; selects even-position bytes
	movdqa	xmm6, [shufb_mask_high]	; selects odd-position bytes

.row_pair:
	mov		eax, [esp+40]			; iSrcWidth
	sar		eax, 1					; destination width
	mov		ebx, eax
	sar		eax, 3					; eax = 8-byte destination chunks per row
	neg		ebx						; ebx = -(destination width), used to rewind

.chunk16:
	movntdqa	xmm0, [esi]			; upper row, 16 bytes
	movntdqa	xmm1, [esi+ecx]		; lower row, 16 bytes

	; upper row: even / odd bytes widened to words, then averaged
	movdqa	xmm2, xmm0
	pshufb	xmm0, xmm7				; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
	pshufb	xmm2, xmm6				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
	pavgb	xmm0, xmm2

	; lower row: same treatment
	movdqa	xmm3, xmm1
	pshufb	xmm1, xmm7
	pshufb	xmm3, xmm6
	pavgb	xmm1, xmm3

	pavgb		xmm0, xmm1			; vertical average, still one result per word
	packuswb	xmm0, xmm1			; low 8 bytes = the 8 results (upper 8 bytes are not stored)

	movq	[edi], xmm0				; 8 destination bytes

	; advance to the next chunk
	lea		esi, [esi+16]
	lea		edi, [edi+8]

	dec		eax
	jg		near .chunk16

	; move both pointers to the start of the next row (pair)
	lea		esi, [esi+2*ecx]		; + 2 * iSrcStride
	lea		esi, [esi+2*ebx]		; - 2 * destination width (bytes consumed)
	lea		edi, [edi+edx]			; + iDstStride
	lea		edi, [edi+ebx]			; - destination width (bytes written)

	dec		ebp
	jg		near .row_pair

	pop		ebp
	pop		edi
	pop		esi
	pop		edx
	pop		ebx
	ret





;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
;                           unsigned int uiScaleX, unsigned int uiScaleY );
;
;	Arbitrary-ratio bilinear downsampler.
;	  * Source positions are tracked in 15-bit fixed point: xInverse / yInverse
;	    start at 16384 and advance by uiScaleX / uiScaleY per destination
;	    pixel / row; the integer source index is position >> 15.
;	  * The interpolation weights u, (1-u), v, (1-v) are 15-bit words; each
;	    output is (sum of four weight-product * pixel terms) rounded and
;	    shifted right by 30.
;	  * The last column of every row and the whole last row are copied from
;	    the nearest source sample instead of being interpolated.
;	  * Nothing is written when iDstWidth <= 0 or iDstHeight <= 0.
;	iSrcWidth / iSrcHeight are not read by this routine.
;	NOTE(review): the prototype above says int, but no result is loaded into
;	eax before ret -- confirm that the C declaration is void.
;**************************************************************************************************************

WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
	push	ebp
	push	esi
	push	edi
	push	ebx
%define		pushsize	16
%define		localsize	28
%define		pDstData		esp + pushsize + localsize + 4
%define		dwDstStride		esp + pushsize + localsize + 8
%define		dwDstWidth		esp + pushsize + localsize + 12
%define		dwDstHeight		esp + pushsize + localsize + 16
%define		pSrcData		esp + pushsize + localsize + 20
%define		dwSrcStride		esp + pushsize + localsize + 24
%define		dwSrcWidth		esp + pushsize + localsize + 28
%define		dwSrcHeight		esp + pushsize + localsize + 32
%define		uiScaleX			esp + pushsize + localsize + 36
%define		uiScaleY			esp + pushsize + localsize + 40
%define		tmpHeight		esp + 12
%define		yInverse		esp + 16
%define		xInverse		esp + 20
%define		dstStep			esp + 24
	sub		esp,			localsize

	; An empty destination means there is nothing to produce.  Without this
	; check the bottom-tested loops below would still write pixels.
	cmp		dword [dwDstHeight],	0
	jle		.done
	cmp		dword [dwDstWidth],		0
	jle		.done

	pxor	xmm0,	xmm0

	; per-pixel horizontal weight increments
	mov		eax,	[uiScaleX]
	and		eax,	32767
	mov		ebx,	eax
	neg		ebx
	and		ebx,	32767
	movd	xmm1,		eax						; uinc  = uiScaleX mod 32768
	movd	xmm2,		ebx						; -uinc (mod 32768)
	psllq	xmm1,		32
	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc

	; per-row vertical weight increments
	mov		eax,	[uiScaleY]
	and		eax,	32767
	mov		ebx,	eax
	neg		ebx
	and		ebx,	32767
	movd	xmm6,		eax						; vinc  = uiScaleY mod 32768
	movd	xmm2,		ebx						; -vinc (mod 32768)
	psllq	xmm6,		32
	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc

	; initial weights
	mov		edx,		40003fffh
	movd	xmm5,		edx
	punpcklwd	xmm5,	xmm0					; 16384 16383
	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383

	mov		edi,			[pDstData]
	mov		edx,			[dwDstStride]
	sub		edx,			[dwDstWidth]
	mov		[dstStep],		edx				; stride - width
	mov		eax,			[dwDstHeight]
	dec		eax
	mov		[tmpHeight],	eax				; rows to interpolate = height - 1
	mov		dword [yInverse],	16384

	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383

	; A single-row destination consists of the copied last row only.
	test	eax,		eax
	jz		.last_row

.height_loop:
	mov		eax,	[yInverse]
	mov		esi,	[pSrcData]
	shr		eax,	15
	mul		dword [dwSrcStride]			; eax = row index * stride (edx is clobbered)
	add		esi,	eax					; esi = upper source row
	mov		ebp,	esi
	add		ebp,	[dwSrcStride]		; ebp = lower source row

	mov		dword [xInverse],	16384
	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383

	mov		ecx,		[dwDstWidth]
	dec		ecx							; interpolated pixels = width - 1
	jz		.width_end					; width == 1: only the copied last column

.width_loop:
	mov		eax,		[xInverse]
	shr		eax,		15

	movd	xmm1,		[esi+eax]		; xxxxxxba
	movd	xmm2,		[ebp+eax]		; xxxxxxdc
	pxor	xmm0,		xmm0
	punpcklwd	xmm1,	xmm2			; xxxxdcba
	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
	punpcklwd	xmm1,	xmm0			; 000d000c000b000a

	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
	movdqa	xmm0,	xmm2
	pmuludq	xmm2,	xmm1	; dwords 0 and 2: weight * pixel (a, c)
	psrlq	xmm0,	32
	psrlq	xmm1,	32
	pmuludq	xmm0,	xmm1	; dwords 1 and 3: weight * pixel (b, d)
	paddq	xmm2,	xmm0
	pshufd	xmm1,	xmm2,	00001110b
	paddq	xmm2,	xmm1	; low qword = sum of the four terms
	psrlq	xmm2,	29

	movd	eax,	xmm2
	inc		eax
	shr		eax,	1		; together with the shift above: round, then >> 30
	mov		[edi],	al
	inc		edi

	mov		eax,		[uiScaleX]
	add		[xInverse],	eax

	paddw	xmm3,		xmm7			; inc u
	psllw	xmm3,		1				; keep the weights within 15 bits
	psrlw	xmm3,		1

	dec		ecx
	jnz		.width_loop

.width_end:
	; last column: nearest source sample of the upper row
	mov		eax,		[xInverse]
	shr		eax,		15
	mov		cl,			[esi+eax]
	mov		[edi],		cl
	inc		edi

	mov		eax,		[uiScaleY]
	add		[yInverse],	eax
	add		edi,		[dstStep]

	paddw	xmm4,	xmm6				; inc v
	psllw	xmm4,	1					; keep the weights within 15 bits
	psrlw	xmm4,	1

	dec		dword [tmpHeight]
	jg		.height_loop

.last_row:
	; last row: nearest source sample, no interpolation
	mov		eax,	[yInverse]
	mov		esi,	[pSrcData]
	shr		eax,	15
	mul		dword [dwSrcStride]
	add		esi,	eax					; get current row address

	mov		dword [xInverse],	16384
	mov		ecx,			[dwDstWidth]	; >= 1 here

.last_row_width:
	mov		eax,		[xInverse]
	shr		eax,		15

	mov		al,			[esi+eax]
	mov		[edi],	al
	inc		edi

	mov		eax,		[uiScaleX]
	add		[xInverse],	eax

	dec		ecx
	jnz		.last_row_width

.done:
	add		esp,			localsize
	pop		ebx
	pop		edi
	pop		esi
	pop		ebp
%undef		pushsize
%undef		localsize
%undef		pSrcData
%undef		dwSrcWidth
%undef		dwSrcHeight
%undef		dwSrcStride
%undef		pDstData
%undef		dwDstWidth
%undef		dwDstHeight
%undef		dwDstStride
%undef		uiScaleX
%undef		uiScaleY
%undef		tmpHeight
%undef		yInverse
%undef		xInverse
%undef		dstStep
	ret




;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
;               unsigned int uiScaleX, unsigned int uiScaleY );
;
;	Arbitrary-ratio bilinear downsampler, reduced-precision variant.
;	  * Horizontal position: 16-bit fixed point (xInverse starts at 32768,
;	    source column = xInverse >> 16).  Vertical position: 15-bit fixed
;	    point (yInverse starts at 16384, source row = yInverse >> 15).
;	  * Weights are 16-bit words; the four products are reduced with pmulhuw,
;	    multiplied by the pixels with pmaddwd, and the sum is rounded with
;	    +16384 and shifted right by 15.
;	  * The last column of every row and the whole last row are copied from
;	    the nearest source sample instead of being interpolated.
;	  * Nothing is written when iDstWidth <= 0 or iDstHeight <= 0.
;	iSrcWidth / iSrcHeight are not read by this routine.
;	NOTE(review): the prototype above says int, but no result is loaded into
;	eax before ret -- confirm that the C declaration is void.
;**************************************************************************************************************

WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
	push	ebp
	push	esi
	push	edi
	push	ebx
%define		pushsize	16
%define		localsize	28
%define		pDstData		esp + pushsize + localsize + 4
%define		dwDstStride		esp + pushsize + localsize + 8
%define		dwDstWidth		esp + pushsize + localsize + 12
%define		dwDstHeight		esp + pushsize + localsize + 16
%define		pSrcData		esp + pushsize + localsize + 20
%define		dwSrcStride		esp + pushsize + localsize + 24
%define		dwSrcWidth		esp + pushsize + localsize + 28
%define		dwSrcHeight		esp + pushsize + localsize + 32
%define		uiScaleX			esp + pushsize + localsize + 36
%define		uiScaleY			esp + pushsize + localsize + 40
%define		tmpHeight		esp + 12
%define		yInverse		esp + 16
%define		xInverse		esp + 20
%define		dstStep			esp + 24
	sub		esp,			localsize

	; An empty destination means there is nothing to produce.  Without this
	; check the bottom-tested loops below would still write pixels.
	cmp		dword [dwDstHeight],	0
	jle		.done
	cmp		dword [dwDstWidth],		0
	jle		.done

	pxor	xmm0,	xmm0						; stays zero for the whole routine

	; per-pixel horizontal weight increments (16-bit)
	mov		eax,	[uiScaleX]
	and		eax,	65535
	mov		ebx,	eax
	neg		ebx
	and		ebx,	65535
	movd	xmm1,		eax						; uinc  = uiScaleX mod 65536
	movd	xmm2,		ebx						; -uinc (mod 65536)
	psllq	xmm1,		32
	por		xmm1,		xmm2					; 0 uinc 0 -uinc
	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc

	; per-row vertical weight increments (15-bit)
	mov		eax,	[uiScaleY]
	and		eax,	32767
	mov		ebx,	eax
	neg		ebx
	and		ebx,	32767
	movd	xmm6,		eax						; vinc  = uiScaleY mod 32768
	movd	xmm2,		ebx						; -vinc (mod 32768)
	psllq	xmm6,		32
	por		xmm6,		xmm2					; 0 vinc 0 -vinc
	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc

	; initial weights and rounding term
	mov		edx,		80007fffh				; 32768 32767
	movd	xmm5,		edx
	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
	mov		ebx,		16384					; rounding term added before >> 15; live in all loops

	mov		edi,			[pDstData]
	mov		edx,			[dwDstStride]
	sub		edx,			[dwDstWidth]
	mov		[dstStep],		edx				; stride - width
	mov		eax,			[dwDstHeight]
	dec		eax
	mov		[tmpHeight],	eax				; rows to interpolate = height - 1
	mov		dword [yInverse],	16384

	pshuflw	xmm4,		xmm5,	01010000b
	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383

	; A single-row destination consists of the copied last row only.
	test	eax,		eax
	jz		.last_row

.height_loop:
	mov		eax,	[yInverse]
	mov		esi,	[pSrcData]
	shr		eax,	15
	mul		dword [dwSrcStride]			; eax = row index * stride (edx is clobbered)
	add		esi,	eax					; esi = upper source row
	mov		ebp,	esi
	add		ebp,	[dwSrcStride]		; ebp = lower source row

	mov		dword [xInverse],	32768
	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767

	mov		ecx,		[dwDstWidth]
	dec		ecx							; interpolated pixels = width - 1
	jz		.width_end					; width == 1: only the copied last column

.width_loop:
	mov		eax,		[xInverse]
	shr		eax,		16

	movd	xmm1,		[esi+eax]		; xxxxxxba
	movd	xmm2,		[ebp+eax]		; xxxxxxdc
	punpcklwd	xmm1,	xmm2			; xxxxdcba
	punpcklbw	xmm1,	xmm0			; 0d0c0b0a

	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
	pmaddwd		xmm2,	xmm1	; dword0 = wa*a + wb*b, dword1 = wc*c + wd*d
	pshufd	xmm1,	xmm2,	00000001b
	paddd	xmm2,	xmm1	; dword0 = sum of the four terms
	movd	xmm1,	ebx
	paddd	xmm2,	xmm1	; + 16384 (rounding)
	psrld	xmm2,	15

	packuswb	xmm2,	xmm0
	movd	eax,	xmm2
	mov		[edi],	al
	inc		edi

	mov		eax,		[uiScaleX]
	add		[xInverse],	eax

	paddw	xmm3,		xmm7			; inc u (wraps modulo 65536)

	dec		ecx
	jnz		.width_loop

.width_end:
	; last column: nearest source sample of the upper row
	mov		eax,		[xInverse]
	shr		eax,		16
	mov		cl,			[esi+eax]
	mov		[edi],		cl
	inc		edi

	mov		eax,		[uiScaleY]
	add		[yInverse],	eax
	add		edi,		[dstStep]

	paddw	xmm4,	xmm6				; inc v
	psllw	xmm4,	1					; keep the weights within 15 bits
	psrlw	xmm4,	1

	dec		dword [tmpHeight]
	jg		.height_loop

.last_row:
	; last row: nearest source sample, no interpolation
	mov		eax,	[yInverse]
	mov		esi,	[pSrcData]
	shr		eax,	15
	mul		dword [dwSrcStride]
	add		esi,	eax					; get current row address

	mov		dword [xInverse],	32768
	mov		ecx,			[dwDstWidth]	; >= 1 here

.last_row_width:
	mov		eax,		[xInverse]
	shr		eax,		16

	mov		al,			[esi+eax]
	mov		[edi],	al
	inc		edi

	mov		eax,		[uiScaleX]
	add		[xInverse],	eax

	dec		ecx
	jnz		.last_row_width

.done:
	add		esp,			localsize
	pop		ebx
	pop		edi
	pop		esi
	pop		ebp
%undef		pushsize
%undef		localsize
%undef		pSrcData
%undef		dwSrcWidth
%undef		dwSrcHeight
%undef		dwSrcStride
%undef		pDstData
%undef		dwDstWidth
%undef		dwDstHeight
%undef		dwDstStride
%undef		uiScaleX
%undef		uiScaleY
%undef		tmpHeight
%undef		yInverse
%undef		xInverse
%undef		dstStep
	ret
%endif