shithub: openh264

Download patch

ref: 783e6fbd124f0d5ef4f993c24ab67abd80706cdb
parent: 7a9f15cb8a1371556242aa2de79d09ae42c57675
author: Guangwei Wang <[email protected]>
date: Fri Oct 28 12:20:30 EDT 2016

Avoid text relocations (TEXTREL) on 32-bit x86

--- a/codec/common/x86/dct.asm
+++ b/codec/common/x86/dct.asm
@@ -393,12 +393,38 @@
 ; out=%1 in=%1 clobber=%2
 %macro SSE2_DCT_HORIZONTAL 2
     pshuflw       %2, %1, 1bh               ; [x[3],x[2],x[1],x[0]] low qw
+%ifdef X86_32_PICASM
+    push          r0                        ; save r0; it is reused to hold the original esp
+    mov           r0, esp                   ; r0 = esp before realignment
+    and           esp, 0xfffffff0           ; align esp down to 16 bytes for the pmullw memory operands
+    push          0xffff0001    ;wels_p1m1p1m1w_128, at [esp+32] after all 12 pushes
+    push          0xffff0001                ; each dword holds the words [0x0001,0xffff] = [+1,-1]
+    push          0xffff0001
+    push          0xffff0001
+    push          0x0001ffff    ;wels_p1m1m1p1w_128, at [esp+16] after all 12 pushes
+    push          0xffff0001
+    push          0x0001ffff
+    push          0xffff0001
+    push          0x00020001    ;wels_p1p2p1p2w_128, at [esp] after all 12 pushes
+    push          0x00020001
+    push          0x00020001
+    push          0x00020001
+    pmullw        %1, [esp+32]  ; [x[0],-x[1],x[2],-x[3], ...]
+%else
     pmullw        %1, [wels_p1m1p1m1w_128]  ; [x[0],-x[1],x[2],-x[3], ...]
+%endif
     pshufhw       %2, %2, 1bh               ; [x[3],x[2],x[1],x[0]] high qw
     paddw         %1, %2                    ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...]
     pshufd        %2, %1, 0b1h              ; [s[2],s[3],s[0],s[1], ...]
+%ifdef X86_32_PICASM
+    pmullw        %1, [esp+16]  ; [s[0],-s[1],-s[2],s[3], ...]
+    pmullw        %2, [esp]  ; [s[2],2*s[3],s[0],2*s[1], ...]
+    mov           esp, r0                   ; restore the pre-alignment esp, discarding the constants
+    pop           r0                        ; restore the r0 value saved on entry
+%else
     pmullw        %1, [wels_p1m1m1p1w_128]  ; [s[0],-s[1],-s[2],s[3], ...]
     pmullw        %2, [wels_p1p2p1p2w_128]  ; [s[2],2*s[3],s[0],2*s[1], ...]]
+%endif
     paddw         %1, %2                    ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...]
 %endmacro
 
@@ -410,7 +436,22 @@
 ;
 ; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4
 %macro SSE2_IDCT_HORIZONTAL 4
+%ifdef X86_32_PICASM
+    push          r0
+    mov           r0, esp
+    and           esp, 0xfffffff0
+    push          0x80000000    ;wels_p0m8000p0m8000w_128
+    push          0x80000000
+    push          0x80000000
+    push          0x80000000
+    push          0xffffffff    ;wels_p1p1m1m1w_128
+    push          0x00010001
+    push          0xffffffff
+    push          0x00010001
+    movdqa        %3, [esp+16]
+%else
     movdqa        %3, [wels_p0m8000p0m8000w_128]
+%endif
     pmulhw        %3, %1                    ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16
     pshufd        %4, %1, 0b1h              ; [x[2],x[3],x[0],x[1], ...]
     pmullw        %4, %2                    ; [x[2],-x[3],-x[0],x[1], ...]
@@ -417,7 +458,13 @@
     paddw         %1, %3                    ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...]
     paddw         %1, %4                    ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...]
     pshuflw       %3, %1, 1bh               ; [s[3],s[2],s[1],s[0]] low qw
+%ifdef X86_32_PICASM
+    pmullw        %1, [esp]  ; [s[0],s[1],-s[2],-s[3], ...]
+    mov           esp, r0
+    pop           r0
+%else
     pmullw        %1, [wels_p1p1m1m1w_128]  ; [s[0],s[1],-s[2],-s[3], ...]
+%endif
     pshufhw       %3, %3, 1bh               ; [s[3],s[2],s[1],s[0]] high qw
     pmullw        %3, %2                    ; [s[3],-s[2],-s[1],s[0], ...]
     paddw         %1, %3                    ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...]
@@ -434,9 +481,24 @@
     punpckhqdq    %2, %1                    ; s03 = [x0+x3,x0-x3]
     punpcklqdq    %3, %1                    ; s12 = [x1+x2,x1-x2]
     movdqa        %1, %2
+%ifdef X86_32_PICASM
+    push          r0
+    mov           r0, esp
+    and           esp, 0xfffffff0
+    push          0x00020002    ;wels_4xp1w_4xp2w
+    push          0x00020002
+    push          0x00010001
+    push          0x00010001
+    pmullw        %1, [esp]    ; [s03[0],2*s03[1]]
+    paddw         %1, %3                    ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
+    pmullw        %3, [esp]    ; [s12[0],2*s12[1]]
+    mov           esp, r0
+    pop           r0
+%else
     pmullw        %1, [wels_4xp1w_4xp2w]    ; [s03[0],2*s03[1]]
     paddw         %1, %3                    ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]]
     pmullw        %3, [wels_4xp1w_4xp2w]    ; [s12[0],2*s12[1]]
+%endif
     psubw         %2, %3                    ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]]
 %endmacro
 
@@ -444,7 +506,20 @@
 ; Output is scrambled to save a negation.
 ; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4
 %macro SSE2_IDCT_4x4P 4
+%ifdef X86_32_PICASM
+    push          r0
+    mov           r0, esp
+    and           esp, 0xfffffff0
+    push          0x80008000    ;wels_4xp0w_4xm8000w
+    push          0x80008000
+    push          0x00000000
+    push          0x00000000
+    movdqa        %4, [esp]
+    mov           esp, r0
+    pop           r0
+%else
     movdqa        %4, [wels_4xp0w_4xm8000w]
+%endif
     movdqa        %3, %1
     pmulhw        %3, %4                    ; x[0:1] * [0,-8000h] >> 16
     pmulhw        %4, %2                    ; x[2:3] * [0,-8000h] >> 16
@@ -521,7 +596,18 @@
     ;Load 4x8
     SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
+%ifdef X86_32_PICASM
+    push          r5
+    mov           r5, esp
+    and           esp, 0xffffffe0
+    push          0x0001ffff    ;wels_p1m1m1p1w_128
+    push          0xffff0001
+    push          0x0001ffff
+    push          0xffff0001
+    movdqa xmm7, [esp]
+%else
     movdqa xmm7, [wels_p1m1m1p1w_128]
+%endif
     SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -540,7 +626,13 @@
     lea     r2, [r2 + 2 * r3]
     SSE2_Load4x8p  r4+64, xmm0, xmm1, xmm4, xmm2, xmm5
 
+%ifdef X86_32_PICASM
+    movdqa xmm7, [esp]
+    mov    esp, r5
+    pop    r5
+%else
     movdqa xmm7, [wels_p1m1m1p1w_128]
+%endif
     SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6
     SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6
@@ -604,7 +696,20 @@
 
     SSE2_Load2x4P xmm0, r4
     SSE2_Load2x4P xmm1, r4+16
+%ifdef X86_32_PICASM
+    push          r5
+    mov           r5, esp
+    and           esp, 0xfffffff0
+    push          0x0001ffff    ;wels_p1m1m1p1w_128
+    push          0xffff0001
+    push          0x0001ffff
+    push          0xffff0001
+    movdqa xmm4, [esp]
+    mov           esp, r5
+    pop           r5
+%else
     movdqa xmm4, [wels_p1m1m1p1w_128]
+%endif
     SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3
     SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3
     SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3
@@ -710,7 +815,20 @@
     vpshufb       y%9, y%9, y%8
     vpaddsw       y%4, y%4, y%9
     vpackuswb     y%3, y%3, y%4
+%ifdef X86_32_PICASM
+    push          r0
+    mov           r0, esp
+    and           esp, 0xffffffe0
+    push          0x0d0f0e0c    ;wels_shufb0231_128
+    push          0x090b0a08
+    push          0x05070604
+    push          0x01030200
+    vbroadcasti128 y%4, [esp]
+    mov           esp, r0
+    pop           r0
+%else
     vbroadcasti128 y%4, [wels_shufb0231_128]
+%endif
     vpshufb       y%3, y%3, y%4
     vextracti128  x%4, y%3, 1
     vmovlps       [%1         ], x%3
@@ -788,7 +906,20 @@
     AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10
     vpaddsw        y%3, y%3, y%8
     vpackuswb      y%3, y%3, y%3
+%ifdef X86_32_PICASM
+    push          r0
+    mov           r0, esp
+    and           esp, 0xffffffe0
+    push          0x0d0f0e0c    ;wels_shufb0231_128
+    push          0x090b0a08
+    push          0x05070604
+    push          0x01030200
+    vbroadcasti128 y%8, [esp]
+    mov           esp, r0
+    pop           r0
+%else
     vbroadcasti128 y%8, [wels_shufb0231_128]
+%endif
     vpshufb        y%3, y%3, y%8
     vextracti128   x%8, y%3, 1
     vmovd          [%1         ], x%3
@@ -834,10 +965,39 @@
 ; Uses scrambled input to save a negation.
 ; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3
 %macro AVX2_DCT_HORIZONTAL 3
+%ifdef X86_32_PICASM
+    push          r0                        ; save r0; it is reused to hold the original esp
+    mov           r0, esp                   ; r0 = esp before realignment
+    and           esp, 0xffffffe0           ; align esp down to 32 bytes
+    push          0xffff0001    ;wels_p1m1p1m1w_256, at [esp+32] after all 16 pushes
+    push          0xffff0001                ; each dword holds the words [0x0001,0xffff] = [+1,-1]
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xfffeffff    ;wels_p1p2m1m2w_256, at [esp] after all 16 pushes
+    push          0x00020001                ; dword pairs read back from memory as words [+1,+2,-1,-2]
+    push          0xfffeffff
+    push          0x00020001
+    push          0xfffeffff
+    push          0x00020001
+    push          0xfffeffff
+    push          0x00020001
+    vpsignw       %3, %1, [esp+32]  ; [x0,-x3,x1,-x2]
+%else
     vpsignw       %3, %1, [wels_p1m1p1m1w_256]  ; [x0,-x3,x1,-x2]
+%endif
     vpshufb       %1, %1, %2                    ; [x3,x0,x2,x1]
     vpaddw        %1, %1, %3                    ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+%ifdef X86_32_PICASM
+    vpmullw       %3, %1, [esp]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
+    mov           esp, r0                   ; restore the pre-alignment esp, discarding the constants
+    pop           r0                        ; restore the r0 value saved on entry
+%else
     vpmullw       %3, %1, [wels_p1p2m1m2w_256]  ; [s[0],2*s[1],-s[2],-2*s[3], ...]
+%endif
     vpshufd       %1, %1, 0b1h                  ; [s[2],s[3],s[0],s[1], ...]
     vpaddw        %1, %1, %3                    ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...]
 %endmacro
@@ -848,11 +1008,40 @@
 %macro AVX2_IDCT_HORIZONTAL 3
     vpsraw        %3, %1, 1                     ; [x0>>1,x1>>1,x2>>1,x3>>1]
     vpblendw      %3, %1, %3, 10101010b         ; [x0,x1>>1,x2,x3>>1]
+%ifdef X86_32_PICASM
+    push          r0                        ; save r0; it is reused to hold the original esp
+    mov           r0, esp                   ; r0 = esp before realignment
+    and           esp, 0xffffffe0           ; align esp down to 32 bytes
+    push          0xffffffff    ;wels_p1p1m1m1w_256, at [esp+32] after all 16 pushes
+    push          0x00010001                ; dword pairs read back from memory as words [+1,+1,-1,-1]
+    push          0xffffffff
+    push          0x00010001
+    push          0xffffffff
+    push          0x00010001
+    push          0xffffffff
+    push          0x00010001
+    push          0xffff0001    ;wels_p1m1p1m1w_256, at [esp] after all 16 pushes
+    push          0xffff0001                ; each dword holds the words [0x0001,0xffff] = [+1,-1]
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    push          0xffff0001
+    vpsignw       %1, %1, [esp+32]  ; [x0,x1,-x2,-x3]
+%else
     vpsignw       %1, %1, [wels_p1p1m1m1w_256]  ; [x0,x1,-x2,-x3]
+%endif
     vpshufd       %3, %3, 0b1h                  ; [x2,x3>>1,x0,x1>>1]
     vpaddw        %1, %3, %1                    ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
     vpshufb       %3, %1, %2                    ; [s[1],s[0],s[3],s[2], ...]
+%ifdef X86_32_PICASM
+    vpsignw       %1, %1, [esp]  ; [s[0],-s[1],s[2],-s[3], ...]
+    mov           esp, r0                   ; restore the pre-alignment esp, discarding the constants
+    pop           r0                        ; restore the r0 value saved on entry
+%else
     vpsignw       %1, %1, [wels_p1m1p1m1w_256]  ; [s[0],-s[1],s[2],-s[3], ...]
+%endif
     vpaddw        %1, %1, %3                    ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...]
 %endmacro
 
@@ -860,10 +1049,39 @@
 ; Uses scrambled input to save a negation.
 ; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2
 %macro AVX2_DCT_4x4P 2
+%ifdef X86_32_PICASM
+    push          r0                        ; save r0; it is reused to hold the original esp
+    mov           r0, esp                   ; r0 = esp before realignment
+    and           esp, 0xffffffe0           ; align esp down to 32 bytes
+    push          0xffffffff    ;wels_4xp1w_4xm1w_256, at [esp+32] after all 16 pushes
+    push          0xffffffff                ; reads back from memory as words 4x(+1),4x(-1),4x(+1),4x(-1)
+    push          0x00010001
+    push          0x00010001
+    push          0xffffffff
+    push          0xffffffff
+    push          0x00010001
+    push          0x00010001
+    push          0xfffefffe    ;wels_4xp1w_4xp2w_4xm1w_4xm2w, at [esp] after all 16 pushes
+    push          0xfffefffe                ; reads back from memory as words 4x(+1),4x(+2),4x(-1),4x(-2)
+    push          0xffffffff
+    push          0xffffffff
+    push          0x00020002
+    push          0x00020002
+    push          0x00010001
+    push          0x00010001
+    vpsignw       %2, %1, [esp+32]         ; [x0,-x3,x1,-x2]
+%else
     vpsignw       %2, %1, [wels_4xp1w_4xm1w_256]         ; [x0,-x3,x1,-x2]
+%endif
     vpshufd       %1, %1, 4eh                            ; [x3,x0,x2,x1]
     vpaddw        %1, %1, %2                             ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1]
+%ifdef X86_32_PICASM
+    vpmullw       %2, %1, [esp] ; [s[0],2*s[1],-s[2],-2*s[3]]
+    mov           esp, r0                   ; restore the pre-alignment esp, discarding the constants
+    pop           r0                        ; restore the r0 value saved on entry
+%else
     vpmullw       %2, %1, [wels_4xp1w_4xp2w_4xm1w_4xm2w] ; [s[0],2*s[1],-s[2],-2*s[3]]
+%endif
     vpermq        %1, %1, 4eh                            ; [s[2],s[3],s[0],s[1]]
     vpaddw        %1, %1, %2                             ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]]
 %endmacro
@@ -874,11 +1092,40 @@
 %macro AVX2_IDCT_4x4P 2
     vpsraw        %2, %1, 1                              ; [x0>>1,x1>>1,x2>>1,x3>>1]
     vpblendw      %2, %1, %2, 11110000b                  ; [x0,x1>>1,x2,x3>>1]
+%ifdef X86_32_PICASM
+    push          r0                        ; save r0; it is reused to hold the original esp
+    mov           r0, esp                   ; r0 = esp before realignment
+    and           esp, 0xffffffe0           ; align esp down to 32 bytes
+    push          0xffffffff    ;wels_8xp1w_8xm1w, at [esp+32] after all 16 pushes
+    push          0xffffffff                ; reads back from memory as words 8x(+1),8x(-1)
+    push          0xffffffff
+    push          0xffffffff
+    push          0x00010001
+    push          0x00010001
+    push          0x00010001
+    push          0x00010001
+    push          0xffffffff    ;wels_4xp1w_4xm1w_256, at [esp] after all 16 pushes
+    push          0xffffffff                ; reads back from memory as words 4x(+1),4x(-1),4x(+1),4x(-1)
+    push          0x00010001
+    push          0x00010001
+    push          0xffffffff
+    push          0xffffffff
+    push          0x00010001
+    push          0x00010001
+    vpsignw       %1, %1, [esp+32]             ; [x0,x1,-x2,-x3]
+%else
     vpsignw       %1, %1, [wels_8xp1w_8xm1w]             ; [x0,x1,-x2,-x3]
+%endif
     vpermq        %2, %2, 4eh                            ; [x2,x3>>1,x0,x1>>1]
     vpaddw        %1, %2, %1                             ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3]
     vpshufd       %2, %1, 4eh                            ; [s[1],s[0],s[3],s[2]]
+%ifdef X86_32_PICASM
+    vpmullw       %1, %1, [esp]         ; [s[0],-s[1],s[2],-s[3], ...]
+    mov           esp, r0                   ; restore the pre-alignment esp, discarding the constants
+    pop           r0                        ; restore the r0 value saved on entry
+%else
     vpmullw       %1, %1, [wels_4xp1w_4xm1w_256]         ; [s[0],-s[1],s[2],-s[3], ...]
+%endif
     vpaddw        %1, %1, %2                             ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]]
 %endmacro
 
@@ -892,7 +1139,22 @@
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
 
+%ifdef X86_32_PICASM
+    push     r5
+    mov      r5, esp
+    and      esp, 0xffffffe0
+    push     0x80068005    ;wels_shufb0312_movzxw_128
+    push     0x80078004
+    push     0x80028001
+    push     0x80038000
+    push     0x0d0c0f0e   ;wels_shufb2301_128
+    push     0x09080b0a
+    push     0x05040706
+    push     0x01000302
+    vbroadcasti128 ymm6, [esp+16]
+%else
     vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
+%endif
 
     ;Load 4x16
     AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5
@@ -907,7 +1169,13 @@
     AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5
 
     AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5
+%ifdef X86_32_PICASM
+    vbroadcasti128 ymm6, [esp]
+    mov      esp, r5
+    pop      r5
+%else
     vbroadcasti128 ymm6, [wels_shufb2301_128]
+%endif
     AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -940,7 +1208,26 @@
     SIGN_EXTENSION r3, r3d
 
     AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5
+%ifdef X86_32_PICASM
+    push     r5
+    mov      r5, esp
+    and      esp, 0xffffffe0
+    push     0x0d0c0f0e    ;wels_shufb2301_128
+    push     0x09080b0a
+    push     0x05040706
+    push     0x01000302
+    push     0x80068005    ;wels_shufb0312_movzxw_128
+    push     0x80078004
+    push     0x80028001
+    push     0x80038000
+    push     0x00200020    ;wels_dw32_128
+    push     0x00200020
+    push     0x00200020
+    push     0x00200020
+    vbroadcasti128 ymm6, [esp+32]
+%else
     vbroadcasti128 ymm6, [wels_shufb2301_128]
+%endif
     AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5
     AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5
@@ -947,8 +1234,15 @@
     AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5
     AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5
 
+%ifdef X86_32_PICASM
+    vbroadcasti128 ymm6, [esp+16]
+    vbroadcasti128 ymm7, [esp]
+    mov     esp, r5
+    pop     r5
+%else
     vbroadcasti128 ymm6, [wels_shufb0312_movzxw_128]
     vbroadcasti128 ymm7, [wels_dw32_128]
+%endif
     AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4
     add r2, r3
     add r0, r1
@@ -969,10 +1263,31 @@
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
 
+%ifdef X86_32_PICASM
+    push     r5
+    mov      r5, esp
+    and      esp, 0xffffffe0
+    push     0x80068005    ;wels_shufb0312_movzxw_128
+    push     0x80078004
+    push     0x80028001
+    push     0x80038000
+    push     0x0d0c0f0e   ;wels_shufb2301_128
+    push     0x09080b0a
+    push     0x05040706
+    push     0x01000302
+    vbroadcasti128 ymm1, [esp+16]
+%else
     vbroadcasti128 ymm1, [wels_shufb0312_movzxw_128]
+%endif
     AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4
     AVX2_DCT_4x4P ymm0, ymm2
+%ifdef X86_32_PICASM
+    vbroadcasti128 ymm1, [esp]
+    mov     esp, r5
+    pop     r5
+%else
     vbroadcasti128 ymm1, [wels_shufb2301_128]
+%endif
     AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2
     AVX2_Store4x4P r0, mm0
     vzeroupper
@@ -1001,11 +1316,37 @@
     SIGN_EXTENSION r3, r3d
 
     AVX2_Load4x4P mm0, r4
+%ifdef X86_32_PICASM
+    push     r5
+    mov      r5, esp
+    and      esp, 0xffffffe0
+    push     0x0d0c0f0e   ;wels_shufb2301_128
+    push     0x09080b0a
+    push     0x05040706
+    push     0x01000302
+    push     0x80068005    ;wels_shufb0312_movzxw_128
+    push     0x80078004
+    push     0x80028001
+    push     0x80038000
+    push     0x00200020    ;wels_dw32_128
+    push     0x00200020
+    push     0x00200020
+    push     0x00200020
+    vbroadcasti128 ymm4, [esp+32]
+%else
     vbroadcasti128 ymm4, [wels_shufb2301_128]
+%endif
     AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1
     AVX2_IDCT_4x4P ymm0, ymm1
+%ifdef X86_32_PICASM
+    vbroadcasti128 ymm4, [esp+16]
+    vbroadcasti128 ymm5, [esp]
+    mov     esp, r5
+    pop     r5
+%else
     vbroadcasti128 ymm4, [wels_shufb0312_movzxw_128]
     vbroadcasti128 ymm5, [wels_dw32_128]
+%endif
     AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3
     vzeroupper
 
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -157,9 +157,25 @@
     ; Unbias and split into a non-negative and a non-positive part.
     ; Clip each part to iTc via minub.
     ; Add/subtract each part to/from p0/q0 and clip.
+%ifdef X86_32_PICASM
+    push       r0
+    mov        r0, esp
+    sub        esp, 16
+    and        esp, -16
+    push       0x60606060    ;WELS_DB96_16
+    push       0x60606060
+    push       0x60606060
+    push       0x60606060
+    movdqa     %6, [esp]
+    psubusb    %6, %8
+    psubusb    %8, [esp]
+    mov        esp, r0
+    pop        r0
+%else
     movdqa     %6, [WELS_DB96_16]
     psubusb    %6, %8
     psubusb    %8, [WELS_DB96_16]
+%endif
     pminub     %6, %5
     pminub     %8, %5
     psubusb    %2, %6
@@ -182,8 +198,21 @@
     movd     xmm1, arg3d
     movd     xmm2, arg4d
     pxor     xmm3, xmm3
+%ifdef X86_32_PICASM
+    push     r4
+    mov      r4, esp
+    sub      esp, 16
+    and      esp, -16
+    push     0x7f7f7f7f
+    push     0x7f7f7f7f
+    push     0x7f7f7f7f
+    push     0x7f7f7f7f
+    pxor     xmm1, [esp]
+    pxor     xmm2, [esp]
+%else
     pxor     xmm1, [WELS_DB127_16]
     pxor     xmm2, [WELS_DB127_16]
+%endif
     pshufb   xmm1, xmm3                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm3                       ; iBeta  ^ 0x7f
     mov      r2, r1                           ; iStride
@@ -196,22 +225,40 @@
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm3           ; |p0 - q0|
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%else
     SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%endif
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm3           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%else
     SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%endif
     pand     xmm6, xmm7                       ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     movdqa   xmm0, xmm7
     SSE2_AbsDiffUB xmm7, xmm4, xmm3           ; |p2 - p0|
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
+%else
     SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
+%endif
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     MOVDQ    xmm3, [r0 + 0 * r2]              ; q0
     movdqa   xmm1, xmm5
     SSE2_AbsDiffUB xmm5, xmm3, xmm4           ; |q2 - q0|
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+    mov      esp, r4
+    pop      r4
+%else
     SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+%endif
 
     pavgb    xmm3, [r3 + 0 * r1]
     pcmpeqw  xmm2, xmm2  ; FFh
@@ -226,7 +273,21 @@
     pxor     xmm1, xmm2
 
     movd     xmm3, [r4]
+%ifdef X86_32_PICASM
+    push     r0
+    mov      r0, esp
+    sub      esp, 16
+    and      esp, -16
+    push     0x03030303    ;WELS_SHUFB0000111122223333
+    push     0x02020202
+    push     0x01010101
+    push     0x00000000
+    pshufb   xmm3, [esp] ; iTc
+    mov      esp, r0
+    pop      r0
+%else
     pshufb   xmm3, [WELS_SHUFB0000111122223333] ; iTc
+%endif
     movdqa   xmm4, xmm3  ; iTc0 = iTc
     pcmpgtb  xmm3, xmm2  ; iTc > -1 ? 0xff : 0x00
     pand     xmm6, xmm3  ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1
@@ -328,8 +389,21 @@
     add      r2, 1
     movd     xmm3, r2d
     pxor     xmm4, xmm4
+%ifdef X86_32_PICASM
+    push     r4
+    mov      r4, esp
+    sub      esp, 16
+    and      esp, -16
+    push     0x7f7f7f7f    ;WELS_DB127_16
+    push     0x7f7f7f7f
+    push     0x7f7f7f7f
+    push     0x7f7f7f7f
+    pxor     xmm1, [esp]
+    pxor     xmm2, [esp]
+%else
     pxor     xmm1, [WELS_DB127_16]
     pxor     xmm2, [WELS_DB127_16]
+%endif
     pshufb   xmm1, xmm4                       ; iAlpha ^ 0x7f
     pshufb   xmm2, xmm4                       ; iBeta  ^ 0x7f
     pshufb   xmm3, xmm4                       ; (iAlpha >> 2) + 1
@@ -344,23 +418,41 @@
     movdqa   xmm4, xmm6
     SSE2_AbsDiffUB xmm6, xmm0, xmm5           ; |p0 - q0|
     SSE2_CmpgeUB xmm3, xmm6                   ; |p0 - q0| < (iAlpha >> 2) + 2
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm6, xmm1, [esp]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%else
     SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16]  ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+%endif
     MOVDQ    xmm1, [r0 + 1 * r2]              ; q1
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p1 - p0|
     SSE2_AbsDiffUB xmm0, xmm1, xmm5           ; |q1 - q0|
     pmaxub   xmm7, xmm0                       ; max(|p1 - p0|, |q1 - q0|)
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%else
     SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+%endif
     pand     xmm6, xmm7                       ; & bDeltaP0Q0
 
     MOVDQ    xmm7, [r3 + 2 * r1]              ; p2
     SSE2_AbsDiffUB xmm7, xmm4, xmm5           ; |p2 - p0|
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm7, xmm2, [esp]  ; bDeltaP2P0 = |p2 - p0| < iBeta
+%else
     SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16]  ; bDeltaP2P0 = |p2 - p0| < iBeta
+%endif
     pand     xmm7, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
     MOVDQ    xmm0, [r0 + 0 * r2]              ; q0
     MOVDQ    xmm5, [r0 + 2 * r2]              ; q2
     SSE2_AbsDiffUB xmm5, xmm0, xmm4           ; |q2 - q0|
+%ifdef X86_32_PICASM
+    SSE2_CmpltUB xmm5, xmm2, [esp]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+    mov      esp, r4
+    pop      r4
+%else
     SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16]  ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+%endif
     pand     xmm5, xmm3                       ; &= |p0 - q0| < (iAlpha >> 2) + 2
 
 %ifdef X86_32
@@ -369,12 +461,26 @@
     mov      r2, esp
     sub      esp,  16
     and      esp, -16
+%ifdef X86_32_PICASM
+    push     0x01010101
+    push     0x01010101
+    push     0x01010101
+    push     0x01010101
+    sub      esp, 16
     movdqa   [esp], xmm5
+    SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [esp+16]
+    movdqa   xmm5, [esp]
+    neg      r1
+    SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [esp+16]
+    mov      esp, r2
+%else
+    movdqa   [esp], xmm5
     SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16]
     movdqa   xmm5, [esp]
     mov      esp, r2
     neg      r1
     SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16]
+%endif
 %else
     movdqa   xmm9, [WELS_DB1_16]
     SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -119,7 +119,14 @@
     paddw mm0, mm1
     movq mm1,mm7
 
+%ifdef X86_32_PICASM
+    pcmpeqw mm7, mm7
+    psrlw   mm7, 15
+    psllw   mm7, 5
+    paddw   mm0, mm7
+%else
     paddw mm0, [h264_d0x20_mmx]
+%endif
     psrlw mm0, 6
 
     WELS_Zero mm7
@@ -194,7 +201,14 @@
     paddw xmm0, xmm1
     movdqa xmm1,xmm7
 
+%ifdef X86_32_PICASM
+    pcmpeqw xmm7, xmm7
+    psrlw   xmm7, 15
+    psllw   xmm7, 5
+    paddw   xmm0, xmm7
+%else
     paddw xmm0, [h264_d0x20_sse2]
+%endif
     psrlw xmm0, 6
 
     WELS_Zero xmm7
@@ -243,7 +257,13 @@
 
     sub r2, r3 ;sub esi, edi
     sub r2, r3
+%ifdef X86_32_PICASM
+    pcmpeqw xmm7, xmm7
+    psrlw   xmm7, 15
+    psllw   xmm7, 5
+%else
     movdqa xmm7, [h264_d0x20_sse2]
+%endif
 
     movdqu xmm0, [r0]
     movdqa xmm1, xmm0
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -112,8 +112,22 @@
 
 SECTION .text
 
+%ifdef X86_32_PICASM
 
+%macro MOVEIMM_DW16 1
+    pcmpeqw      %1,  %1                     ; every word of %1 = 0xffff (register compared with itself)
+    psrlw        %1,  15                     ; every word = 0x0001
+    psllw        %1,  4                      ; every word = 0x0010 (16), built without a memory constant
+%endmacro
 
+%macro MOVEIMM_DW32 1
+    pcmpeqw      %1,  %1                     ; every word of %1 = 0xffff (register compared with itself)
+    psrlw        %1,  15                     ; every word = 0x0001
+    psllw        %1,  5                      ; every word = 0x0020 (32), built without a memory constant
+%endmacro
+
+%endif
+
 ;*******************************************************************************
 ; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
 ;                       int iSrcStride,
@@ -130,7 +144,11 @@
 
     sub r0, 2
     WELS_Zero mm7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 mm6
+%else
     movq mm6, [h264_w0x10_1]
+%endif
 .height_loop:
     movd mm0, [r0]
     punpcklbw mm0, mm7
@@ -179,9 +197,14 @@
 
 %macro FILTER_HV_W8 9
     paddw   %1, %6
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 %8
+    paddw   %1, %8
+%else
+    paddw   %1, [h264_w0x10_1]
+%endif
     movdqa  %8, %3
     movdqa  %7, %2
-    paddw   %1, [h264_w0x10_1]
     paddw   %8, %4
     paddw   %7, %5
     psllw   %8, 2
@@ -198,9 +221,14 @@
 
 %macro FILTER_HV_W4 9
 paddw   %1, %6
+%ifdef X86_32_PICASM
+MOVEIMM_DW16 %8
+paddw   %1, %8
+%else
+paddw   %1, [h264_w0x10_1]
+%endif
 movdqa  %8, %3
 movdqa  %7, %2
-paddw   %1, [h264_w0x10_1]
 paddw   %8, %4
 paddw   %7, %5
 psllw   %8, 2
@@ -291,7 +319,11 @@
     lea r0, [r0-2]            ;pSrc -= 2;
 
     pxor xmm7, xmm7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 xmm6
+%else
     movdqa xmm6, [h264_w0x10_1]
+%endif
 .y_loop:
     movq xmm0, [r0]
     punpcklbw xmm0, xmm7
@@ -347,7 +379,11 @@
     lea r0, [r0-2]            ;pSrc -= 2;
 
     pxor xmm7, xmm7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 xmm6
+%else
     movdqa xmm6, [h264_w0x10_1]
+%endif
 .y_loop:
 
     movq xmm0, [r0]
@@ -819,7 +855,12 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 xmm6
+    paddw xmm0, xmm6
+%else
     paddw xmm0, [h264_w0x10_1]
+%endif
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2], xmm0
@@ -836,7 +877,11 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+    paddw xmm2, xmm6
+%else
     paddw xmm2, [h264_w0x10_1]
+%endif
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+1], xmm2
@@ -873,7 +918,12 @@
     paddw xmm0, xmm4
     psllw xmm4, 2
     paddw xmm0, xmm4
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 xmm6
+    paddw xmm0, xmm6
+%else
     paddw xmm0, [h264_w0x10_1]
+%endif
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movq [r2], xmm0
@@ -901,7 +951,12 @@
     paddw xmm0, xmm6
     psllw xmm6, 2
     paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16 xmm6
+    paddw xmm0, xmm6
+%else
     paddw xmm0, [h264_w0x10_1]
+%endif
     psraw  xmm0, 5
     packuswb xmm0, xmm0
     movd [r2+8], xmm0
@@ -919,7 +974,11 @@
     paddw xmm2, xmm5
     psllw xmm5, 2
     paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+    paddw xmm2, xmm6
+%else
     paddw xmm2, [h264_w0x10_1]
+%endif
     psraw  xmm2, 5
     packuswb xmm2, xmm2
     movq [r2+9], xmm2
@@ -976,7 +1035,12 @@
 paddw xmm0, xmm6
 psllw xmm6, 2
 paddw xmm0, xmm6
+%ifdef X86_32_PICASM
+MOVEIMM_DW16 xmm6
+paddw xmm0, xmm6
+%else
 paddw xmm0, [h264_w0x10_1]
+%endif
 psraw  xmm0, 5
 packuswb xmm0, xmm0
 movd [r2], xmm0
@@ -993,7 +1057,11 @@
 paddw xmm2, xmm5
 psllw xmm5, 2
 paddw xmm2, xmm5
+%ifdef X86_32_PICASM
+paddw xmm2, xmm6
+%else
 paddw xmm2, [h264_w0x10_1]
+%endif
 psraw  xmm2, 5
 packuswb xmm2, xmm2
 movd [r2+1], xmm2
@@ -1170,7 +1238,12 @@
     psubw  %1, %7
     psraw   %1, 2
     paddw  %8, %1
+%ifdef X86_32_PICASM
+    MOVEIMM_DW32 %7
+    paddw  %8, %7
+%else
     paddw  %8, [h264_mc_hc_32]
+%endif
     psraw   %8, 6
     packuswb %8, %8
     movq %9, %8
@@ -1522,7 +1595,12 @@
 psubw  %1, %7
 psraw   %1, 2
 paddw  %8, %1
+%ifdef X86_32_PICASM
+MOVEIMM_DW32 %7
+paddw  %8, %7
+%else
 paddw  %8, [h264_mc_hc_32]
+%endif
 psraw   %8, 6
 packuswb %8, %8
 movd %9, %8
@@ -1801,7 +1879,12 @@
     movdqa          %7, %3
     pmaddubsw       %7, %6
     paddw           %1, %7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16    %7
+    paddw            %1, %7
+%else
     paddw           %1, [h264_w0x10_1]
+%endif
     psraw           %1, 5
 %endmacro
 
@@ -1818,7 +1901,12 @@
     movdqa          %7, %4
     pmaddubsw       %7, %6
     paddw           %1, %7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16    %7
+    paddw            %1, %7
+%else
     paddw           %1, [h264_w0x10_1]
+%endif
     psraw           %1, 5
 %endmacro
 
@@ -1828,7 +1916,20 @@
     pshufb          %1, %2
     pshufb          %5, %3
     pshufd          %6, %1, 10110001b
+%ifdef X86_32_PICASM
+    push            r0
+    mov             r0, esp
+    and             esp, 0xfffffff0
+    push            0x14141414    ;db20_128
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    pmaddubsw       %1, [esp]
+    mov             esp, r0
+    pop             r0
+%else
     pmaddubsw       %1, [db20_128]
+%endif
     pmaddubsw       %5, %4
     pmaddubsw       %6, %4
     paddw           %1, %5
@@ -1838,7 +1939,12 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
 %macro SSSE3_FilterHorizontal_8px 6
     SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16    %5                      ; tmp %5 = 16 in every word, replacing the [h264_w0x10_1] load
+    paddw           %1, %5                  ; add the 16 before the >>5 below
+%else
     paddw           %1, [h264_w0x10_1]
+%endif
     psraw           %1, 5
 %endmacro
 
@@ -1853,7 +1959,20 @@
     pshufb          %7, %4
     punpcklqdq      %6, %7
     pshufd          %7, %1, 10110001b
+%ifdef X86_32_PICASM
+    push            r0
+    mov             r0, esp
+    and             esp, 0xfffffff0
+    push            0x14141414    ;db20_128
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    pmaddubsw       %1, [esp]
+    mov             esp, r0
+    pop             r0
+%else
     pmaddubsw       %1, [db20_128]
+%endif
     pmaddubsw       %6, %5
     pmaddubsw       %7, %5
     paddw           %1, %6
@@ -1863,13 +1982,31 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
 %macro SSSE3_FilterHorizontal_2x4px 7
     SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
+%ifdef X86_32_PICASM
+    MOVEIMM_DW16    %6
+    paddw           %1, %6              ; tmp %6, filled by MOVEIMM_DW16 above, takes the place of the [h264_w0x10_1] operand
+%else
     paddw           %1, [h264_w0x10_1]
+%endif
     psraw           %1, 5
 %endmacro
 
 ; pixels=%1 -32768>>scale=%2 tmp=%3
 %macro SSSE3_FilterHorizontalbw_2px 3
+%ifdef X86_32_PICASM
+    push            r1                     ; r1 is saved so it can hold the pre-alignment esp
+    mov             r1, esp
+    and             esp, 0xfffffff0        ; 16-byte align; the four pushes below keep that alignment
+    push            0x0000fe0a             ; stack copy of maddubsw_m2p10_m40m40_p10m2_p0p0_128, highest dword first
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    pmaddubsw       %1, [esp]              ; legacy-SSE memory operand, hence the alignment above
+    mov             esp, r1                ; discard the constant and undo the alignment in one step
+    pop             r1
+%else
     pmaddubsw       %1, [maddubsw_m2p10_m40m40_p10m2_p0p0_128]
+%endif
     pmaddwd         %1, %2
     pshufd          %3, %1, 10110001b
     paddd           %1, %3
@@ -1877,8 +2014,33 @@
 
 ; pixels=%1 tmp=%2
 %macro SSSE3_FilterHorizontal_2px 2
+%ifdef X86_32_PICASM
+    push            r1                     ; r1 is saved so it can hold the pre-alignment esp
+    mov             r1, esp
+    and             esp, 0xfffffff0        ; 16-byte align; every 4-push group below preserves it
+    push            0x0000fe0a             ; stack copy of maddubsw_m2p10_m40m40_p10m2_p0p0_128, highest dword first
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    pmaddubsw       %1, [esp]
+    push            0xfc00fc00             ; stack copy of dwm1024_128 (eight words of 0xfc00 = -1024)
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    pmaddwd         %1, [esp]
+    pshufd          %2, %1, 10110001b
+    paddd           %1, %2
+    push            0x00008000             ; stack copy of dd32768_128 (four dwords of 0x8000 = 32768)
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    paddd           %1, [esp]
+    mov             esp, r1                ; releases all 48 bytes of constants and the alignment padding
+    pop             r1
+%else
     SSSE3_FilterHorizontalbw_2px %1, [dwm1024_128], %2
     paddd           %1, [dd32768_128]
+%endif
 %endmacro
 
 ; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
@@ -1893,8 +2055,14 @@
     paddw           %7, %4
     paddw           %1, %7
     psraw           %1, 2
+%ifdef X86_32_PICASM
+    paddw           %1, %7
+    MOVEIMM_DW32    %7
+    paddw           %1, %7
+%else
     paddw           %7, [h264_mc_hc_32]
     paddw           %1, %7
+%endif
     psraw           %1, 6
 %endmacro
 
@@ -1931,6 +2099,23 @@
     lea             i_srcstride3, [3 * i_srcstride]
     cmp             i_width, 4
     jg              .width8or16
+
+%ifdef X86_32_PICASM
+    push            0xfb01fb01             ; stack copy of maddubsw_p1m5_128, kept in xmm6 from here on
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    movdqu          xmm6, [esp]            ; unaligned load: esp is not realigned in this path
+    push            0x01fb01fb             ; stack copy of maddubsw_m5p1_128, kept in xmm7 from here on
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    movdqu          xmm7, [esp]
+    push            0x14141414    ;db20_128, left on the stack; a later movdqu xmm5, [esp] reads it before add esp, 48
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+%endif
     movd            xmm0, [p_src]
     movd            xmm4, [p_src + i_srcstride]
     punpcklbw       xmm0, xmm4
@@ -1949,8 +2134,14 @@
     movd            xmm3, [p_src]
     punpcklbw       xmm4, xmm3
     punpcklqdq      xmm2, xmm4
+%ifdef X86_32_PICASM
+    movdqu          xmm5, [esp]
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
+    add             esp, 48
+%else
     movdqa          xmm5, [db20_128]
     SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
     psrlq           xmm0, 32
@@ -1961,7 +2152,11 @@
     movd            xmm0, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm0
     punpcklqdq      xmm3, xmm4
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, xmm6, xmm5, xmm7, xmm4
+%else
     SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm1, xmm1
     movd            [p_dst], xmm1
     psrlq           xmm1, 32
@@ -1972,7 +2167,11 @@
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm0, xmm4
     jg              .width4_height_ge8
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
+%else
     SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
 .width4_height_le5_done:
@@ -1987,7 +2186,11 @@
     movd            xmm1, [p_src]
     punpcklbw       xmm4, xmm1
     punpcklqdq      xmm0, xmm4
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, xmm6, xmm5, xmm7, xmm4
+%else
     SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm2, xmm2
     movd            [p_dst], xmm2
     psrlq           xmm2, 32
@@ -1998,7 +2201,11 @@
     movd            xmm2, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm2
     punpcklqdq      xmm1, xmm4
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, xmm6, xmm5, xmm7, xmm4
+%else
     SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm3, xmm3
     movd            [p_dst], xmm3
     psrlq           xmm3, 32
@@ -2008,7 +2215,11 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     movd            xmm4, [p_src + i_srcstride3]
     punpcklbw       xmm2, xmm4
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, xmm6, xmm5, xmm7, xmm4
+%else
     SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [maddubsw_p1m5_128], xmm5, [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm0, xmm0
     movd            [p_dst], xmm0
 .width4_height_ge8_done:
@@ -2027,6 +2238,31 @@
 .xloop:
     push            p_src
     push            p_dst
+%ifdef X86_32_PICASM
+    push            i_width                ; i_width is saved so it can carry the pre-alignment esp until .yloop_exit
+    mov             i_width, esp
+    and             esp, 0xfffffff0        ; 16-byte align; the five 16-byte constants below are used as memory operands
+    push            0xfb01fb01    ;[esp+64]maddubsw_p1m5_128
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0x14141414    ;[esp+48]db20_128
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x01fb01fb    ;[esp+32]maddubsw_m5p1_128
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x14fb14fb    ;[esp+16]maddubsw_m5p20_128
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0xfb14fb14    ;[esp] maddubsw_p20m5_128
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+%endif
     test            i_ycnt, 1
     jnz             .yloop_begin_even
     movq            xmm0, [p_src]
@@ -2040,7 +2276,11 @@
     movq            xmm5, [p_src + i_srcstride]
     lea             p_src, [p_src + 2 * i_srcstride]
     punpcklbw       xmm4, xmm5
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm7
+%else
     SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm7
+%endif
     packuswb        xmm0, xmm0
     movlps          [p_dst], xmm0
     add             p_dst, i_dststride
@@ -2057,20 +2297,36 @@
     punpcklbw       xmm4, xmm5
 .yloop:
     movq            xmm6, [p_src]
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [esp+16], [esp], xmm0, xmm7
+%else
     SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm0, xmm7
+%endif
     movq            xmm7, [p_src + i_srcstride]
     punpcklbw       xmm6, xmm7
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [esp+64], [esp+48], [esp+32], xmm0
+%else
     SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm0
+%endif
     packuswb        xmm1, xmm2
     movlps          [p_dst], xmm1
     movhps          [p_dst + i_dststride], xmm1
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm0, [p_src + 2 * i_srcstride]
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [esp+16], [esp], xmm2, xmm1
+%else
     SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm2, xmm1
+%endif
     movq            xmm1, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm0, xmm1
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [esp+64], [esp+48], [esp+32], xmm2
+%else
     SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm2
+%endif
     packuswb        xmm3, xmm4
     movlps          [p_dst], xmm3
     movhps          [p_dst + i_dststride], xmm3
@@ -2078,20 +2334,36 @@
     jle             .yloop_exit
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm2, [p_src]
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [esp+16], [esp], xmm4, xmm3
+%else
     SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm4, xmm3
+%endif
     movq            xmm3, [p_src + i_srcstride]
     punpcklbw       xmm2, xmm3
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [esp+64], [esp+48], [esp+32], xmm4
+%else
     SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm4
+%endif
     packuswb        xmm5, xmm6
     movlps          [p_dst], xmm5
     movhps          [p_dst + i_dststride], xmm5
     lea             p_dst, [p_dst + 2 * i_dststride]
     movq            xmm4, [p_src + 2 * i_srcstride]
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [esp+16], [esp], xmm6, xmm5
+%else
     SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [maddubsw_m5p20_128], [maddubsw_p20m5_128], xmm6, xmm5
+%endif
     movq            xmm5, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     punpcklbw       xmm4, xmm5
+%ifdef X86_32_PICASM
+    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [esp+64], [esp+48], [esp+32], xmm6
+%else
     SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [maddubsw_p1m5_128], [db20_128], [maddubsw_m5p1_128], xmm6
+%endif
     packuswb        xmm7, xmm0
     movlps          [p_dst], xmm7
     movhps          [p_dst + i_dststride], xmm7
@@ -2099,6 +2371,10 @@
     sub             i_ycnt, 8
     jg              .yloop
 .yloop_exit:
+%ifdef X86_32_PICASM
+    mov             esp, i_width
+    pop             i_width
+%endif
     pop             p_dst
     pop             p_src
     sub             i_width, 8
@@ -2148,9 +2424,28 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+%ifdef X86_32_PICASM
+    push            0x090a0809        ;shufb_32435465768798A9
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    movdqu          xmm4, [esp]
+    push            0x0c0b0b0a        ;shufb_011267784556ABBC
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    movdqu          xmm5, [esp]
+    push            0x01fb01fb        ;maddubsw_p1m5_p1m5_m5p1_m5p1_128
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    movdqu          xmm6, [esp]
+    add             esp, 48           ;release the three 16-byte temporaries now that they live in xmm4-xmm6
+%else
     movdqa          xmm4, [shufb_32435465768798A9]
     movdqa          xmm5, [shufb_011267784556ABBC]
     movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     cmp             i_width, 8
     je              .width8_yloop
     jg              .width16_yloop
@@ -2229,9 +2524,28 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+%ifdef X86_32_PICASM
+    push            0x090a0809        ;shufb_32435465768798A9
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    movdqu          xmm5, [esp]
+    push            0x0c0b0b0a        ;shufb_011267784556ABBC
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    movdqu          xmm6, [esp]
+    push            0x01fb01fb        ;maddubsw_p1m5_p1m5_m5p1_m5p1_128
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    movdqu          xmm7, [esp]
+    add             esp, 48           ;release the three 16-byte temporaries now that they live in xmm5-xmm7
+%else
     movdqa          xmm5, [shufb_32435465768798A9]
     movdqa          xmm6, [shufb_011267784556ABBC]
     movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     cmp             i_width, 9
     je              .width9_yloop
     jg              .width17_yloop
@@ -2329,9 +2643,28 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
+%ifdef X86_32_PICASM
+    push            0x090a0809        ;shufb_32435465768798A9
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    movdqu          xmm4, [esp]
+    push            0x0c0b0b0a        ;shufb_011267784556ABBC
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    movdqu          xmm5, [esp]
+    push            0x01fb01fb        ;maddubsw_p1m5_p1m5_m5p1_m5p1_128
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    movdqu          xmm6, [esp]
+    add             esp, 48           ;release the three 16-byte temporaries now that they live in xmm4-xmm6
+%else
     movdqa          xmm4, [shufb_32435465768798A9]
     movdqa          xmm5, [shufb_011267784556ABBC]
     movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2443,9 +2776,28 @@
     SIGN_EXTENSION  r4, r4d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
+%ifdef X86_32_PICASM
+    push            0x090a0809        ;shufb_32435465768798A9
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    movdqu          xmm4, [esp]
+    push            0x0c0b0b0a        ;shufb_011267784556ABBC
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    movdqu          xmm5, [esp]
+    push            0x01fb01fb        ;maddubsw_p1m5_p1m5_m5p1_m5p1_128
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    movdqu          xmm6, [esp]
+    add             esp, 48           ;release the three 16-byte temporaries now that they live in xmm4-xmm6
+%else
     movdqa          xmm4, [shufb_32435465768798A9]
     movdqa          xmm5, [shufb_011267784556ABBC]
     movdqa          xmm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 1
 .yloop:
     movdqu          xmm0, [p_src - 2]
@@ -2623,9 +2975,28 @@
     sub             p_src, i_srcstride
     pcmpeqw         xmm4, xmm4
     psllw           xmm4, 15                                ; dw -32768
+%ifdef X86_32_PICASM
+    push            0x090a0809        ;shufb_32435465768798A9
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    movdqu          xmm5, [esp]
+    push            0x0c0b0b0a        ;shufb_011267784556ABBC
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    movdqu          xmm6, [esp]
+    push            0x01fb01fb        ;maddubsw_p1m5_p1m5_m5p1_m5p1_128
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    movdqu          xmm7, [esp]
+    add             esp, 48           ;release the three 16-byte temporaries now that they live in xmm5-xmm7
+%else
     movdqa          xmm5, [shufb_32435465768798A9]
     movdqa          xmm6, [shufb_011267784556ABBC]
     movdqa          xmm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     cmp             i_width, 9
     jne             .width17_yloop
 
@@ -2909,7 +3280,24 @@
     vpshufb         %5, %1, %3
     vpshufb         %1, %1, %2
     vpshufd         %6, %1, 10110001b
+%ifdef X86_32_PICASM
+    push            r0                     ; r0 is saved so it can hold the pre-alignment esp
+    mov             r0, esp
+    and             esp, 0xffffffe0        ; 32-byte align; the eight pushes below fill exactly one 32-byte slot
+    push            0x14141414             ; stack copy of db20_256 (32 bytes of 0x14 = 20)
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    vpmaddubsw      %1, %1, [esp]
+    mov             esp, r0                ; discard the constant and undo the alignment in one step
+    pop             r0
+%else
     vpmaddubsw      %1, %1, [db20_256]
+%endif
     vpmaddubsw      %5, %5, %4
     vpmaddubsw      %6, %6, %4
     vpaddw          %1, %1, %5
@@ -2919,7 +3307,14 @@
 ; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
 %macro AVX2_FilterHorizontal_16px 6
     AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
+%ifdef X86_32_PICASM
+    vpcmpeqw        %6, %6, %6              ; tmp %6 = all bits set
+    vpsrlw          %6, %6, 15              ; every word = 1
+    vpsllw          %6, %6, 4               ; every word = 16; built in-register in place of [h264_w0x10_256]
+    vpaddw          %1, %1, %6
+%else
     vpaddw          %1, %1, [h264_w0x10_256]
+%endif
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -2932,7 +3327,24 @@
     vpunpcklqdq     %1, %1, %2
     vpunpcklqdq     %6, %6, %7
     vpshufd         %7, %1, 10110001b
+%ifdef X86_32_PICASM
+    push            r0                     ; r0 is saved so it can hold the pre-alignment esp
+    mov             r0, esp
+    and             esp, 0xffffffe0        ; 32-byte align; the eight pushes below fill exactly one 32-byte slot
+    push            0x14141414             ; stack copy of db20_256 (32 bytes of 0x14 = 20)
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    vpmaddubsw      %1, %1, [esp]
+    mov             esp, r0                ; discard the constant and undo the alignment in one step
+    pop             r0
+%else
     vpmaddubsw      %1, %1, [db20_256]
+%endif
     vpmaddubsw      %6, %6, %5
     vpmaddubsw      %7, %7, %5
     vpaddw          %1, %1, %6
@@ -2942,7 +3354,14 @@
 ; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
 %macro AVX2_FilterHorizontal_4x4px 7
     AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
+%ifdef X86_32_PICASM
+    vpcmpeqw        %7, %7, %7              ; tmp %7 = all bits set
+    vpsrlw          %7, %7, 15              ; every word = 1
+    vpsllw          %7, %7, 4               ; every word = 16; built in-register in place of [h264_w0x10_256]
+    vpaddw          %1, %1, %7
+%else
     vpaddw          %1, %1, [h264_w0x10_256]
+%endif
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -2956,8 +3375,45 @@
 
 ; pixels=%1 tmp=%2
 %macro AVX2_FilterHorizontal_4px 2
+%ifdef X86_32_PICASM
+    push            r0                     ; r0 is saved so it can hold the pre-alignment esp
+    mov             r0, esp
+    and             esp, 0xffffffe0        ; 32-byte align; three 32-byte constants follow
+    push            0x0000fe0a    ;[esp+64] maddubsw_m2p10_m40m40_p10m2_p0p0_256
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0xfc00fc00    ;[esp+32] dwm1024_256
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0xfc00fc00
+    push            0x00008000    ;[esp] dd32768_256
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    push            0x00008000
+    vpmaddubsw      %1, %1, [esp+64]
+    vpmaddwd        %1, %1, [esp+32]
+    vpshufd         %2, %1, 10110001b
+    vpaddd          %1, %1, %2
+    vpaddd          %1, %1, [esp]
+    mov             esp, r0                ; releases all 96 bytes of constants and the alignment padding
+    pop             r0
+%else
     AVX2_FilterHorizontalbw_4px %1, [dwm1024_256], %2
     vpaddd          %1, %1, [dd32768_256]
+%endif
 %endmacro
 
 ; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
@@ -2967,7 +3423,14 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %3, %6
     vpaddw          %1, %1, %7
+%ifdef X86_32_PICASM
+    vpcmpeqw        %7, %7, %7
+    vpsrlw          %7, %7, 15
+    vpsllw          %7, %7, 4
+    vpaddw          %1, %1, %7
+%else
     vpaddw          %1, %1, [h264_w0x10_256]
+%endif
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -2981,7 +3444,14 @@
     vpaddw          %1, %1, %7
     vpmaddubsw      %7, %4, %6
     vpaddw          %1, %1, %7
+%ifdef X86_32_PICASM
+    vpcmpeqw        %7, %7, %7
+    vpsrlw          %7, %7, 15
+    vpsllw          %7, %7, 4
+    vpaddw          %1, %1, %7
+%else
     vpaddw          %1, %1, [h264_w0x10_256]
+%endif
     vpsraw          %1, %1, 5
 %endmacro
 
@@ -2995,7 +3465,24 @@
     vpaddw          %7, %3, %4
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 2
+%ifdef X86_32_PICASM
+    push            r0                     ; r0 is saved so it can hold the pre-alignment esp
+    mov             r0, esp
+    and             esp, 0xffffffe0        ; 32-byte align; the eight pushes below fill exactly one 32-byte slot
+    push            0x00200020             ; stack copy of dw32_256 (sixteen words of 0x0020 = 32)
+    push            0x00200020
+    push            0x00200020
+    push            0x00200020
+    push            0x00200020
+    push            0x00200020
+    push            0x00200020
+    push            0x00200020
+    vpaddw          %7, %7, [esp]
+    mov             esp, r0                ; discard the constant and undo the alignment in one step
+    pop             r0
+%else
     vpaddw          %7, %7, [dw32_256]
+%endif
     vpaddw          %1, %1, %7
     vpsraw          %1, %1, 6
 %endmacro
@@ -3035,6 +3522,32 @@
     je              .width8
     jg              .width16
 ; .width4:
+%ifdef X86_32_PICASM
+    push            i_width                ; i_width is saved so it can carry the pre-alignment esp until .width4_done
+    mov             i_width, esp
+    and             esp, 0xffffffe0
+    sub             esp, 16                ; pad: with the 16-byte db20_128 this makes 32 bytes, so both 256-bit tables land on 32-byte boundaries
+    push            0x14141414    ;[esp+64] db20_128
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0xfb01fb01    ;[esp+32] maddubsw_p1m5_256
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0x01fb01fb    ;[esp] maddubsw_m5p1_256
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+%endif
     vmovd           xmm0, [p_src]
     vpbroadcastd    xmm5, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm0, xmm5
@@ -3061,8 +3574,13 @@
     vpunpcklbw      ymm5, ymm5, ymm4
     vpblendd        ymm3, ymm3, ymm5, 11001100b
     vpblendd        ymm2, ymm2, ymm3, 11110000b
+%ifdef X86_32_PICASM
+    vbroadcasti128  ymm6, [esp+64]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm6, [esp], ymm5
+%else
     vbroadcasti128  ymm6, [db20_128]
     AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+%endif
     vpackuswb       ymm0, ymm0, ymm0
     vmovd           [p_dst], xmm0
     vpsrlq          xmm5, xmm0, 32
@@ -3078,7 +3596,11 @@
     vpbroadcastd    ymm5, [p_src + i_srcstride3]
     vpunpcklbw      ymm4, ymm4, ymm5
     jg              .width4_height_ge8
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [esp+32], xmm6, [esp], xmm5
+%else
     AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+%endif
     vpackuswb       xmm2, xmm2, xmm2
     vmovd           [p_dst], xmm2
     jmp             .width4_done
@@ -3094,7 +3616,11 @@
     vpunpcklbw      ymm5, ymm5, ymm0
     vpblendd        ymm1, ymm1, ymm5, 11001100b
     vpblendd        ymm4, ymm4, ymm1, 11110000b
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [esp+32], ymm6, [esp], ymm5
+%else
     AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [maddubsw_p1m5_256], ymm6, [maddubsw_m5p1_256], ymm5
+%endif
     vpackuswb       ymm2, ymm2, ymm2
     vmovd           [p_dst], xmm2
     vpsrlq          xmm5, xmm2, 32
@@ -3109,10 +3635,18 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     vmovd           xmm5, [p_src + i_srcstride3]
     vpunpcklbw      xmm0, xmm0, xmm5
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [esp+32], xmm6, [esp], xmm5
+%else
     AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [maddubsw_p1m5_256], xmm6, [maddubsw_m5p1_256], xmm5
+%endif
     vpackuswb       xmm4, xmm4, xmm4
     vmovd           [p_dst], xmm4
 .width4_done:
+%ifdef X86_32_PICASM
+    mov             esp, i_width
+    pop             i_width
+%endif
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
@@ -3122,6 +3656,32 @@
     ret
 
 .width8:
+%ifdef X86_32_PICASM
+    push            i_width                ; i_width is saved so it can carry the pre-alignment esp until .width8_done
+    mov             i_width, esp
+    and             esp, 0xffffffe0
+    sub             esp, 16                ; pad: with the 16-byte db20_128 this makes 32 bytes, so both 256-bit tables land on 32-byte boundaries
+    push            0x14141414    ;[esp+64] db20_128
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0xfb01fb01    ;[esp+32] maddubsw_p1m5_256
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0x01fb01fb    ;[esp] maddubsw_m5p1_256
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+%endif
     sub             i_height, 1
     vmovq           xmm0, [p_src]
     vmovq           xmm4, [p_src + i_srcstride]
@@ -3141,8 +3701,13 @@
     vmovq           xmm3, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm3
     vinserti128     ymm2, ymm2, xmm4, 1
+%ifdef X86_32_PICASM
+    vbroadcasti128  ymm5, [esp+64]
+    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [esp+32], ymm5, [esp], ymm4
+%else
     vbroadcasti128  ymm5, [db20_128]
     AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm3, xmm3, xmm4
@@ -3149,7 +3714,11 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm3, ymm3, xmm4, 1
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [esp+32], ymm5, [esp], ymm4
+%else
     AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
     vpackuswb       ymm0, ymm0, ymm1
     vmovlps         [p_dst], xmm0
     vextracti128    xmm1, ymm0, 1
@@ -3163,7 +3732,11 @@
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm0, xmm6, xmm4
     jg              .width8_height_ge8
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [esp+32], xmm5, [esp], xmm4
+%else
     AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+%endif
     vpackuswb       xmm2, xmm2, xmm2
     vmovlps         [p_dst], xmm2
     jmp             .width8_done
@@ -3171,7 +3744,11 @@
     vmovq           xmm1, [p_src + 2 * i_srcstride]
     vpunpcklbw      xmm4, xmm4, xmm1
     vinserti128     ymm0, ymm0, xmm4, 1
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [esp+32], ymm5, [esp], ymm4
+%else
     AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
     vmovq           xmm4, [p_src + i_srcstride3]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpcklbw      xmm1, xmm1, xmm4
@@ -3178,7 +3755,11 @@
     vmovq           xmm6, [p_src]
     vpunpcklbw      xmm4, xmm4, xmm6
     vinserti128     ymm1, ymm1, xmm4, 1
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [esp+32], ymm5, [esp], ymm4
+%else
     AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [maddubsw_p1m5_256], ymm5, [maddubsw_m5p1_256], ymm4
+%endif
     vpackuswb       ymm2, ymm2, ymm3
     vmovlps         [p_dst], xmm2
     vextracti128    xmm3, ymm2, 1
@@ -3192,10 +3773,18 @@
     jl              .width8_done
     vmovq           xmm4, [p_src + i_srcstride]
     vpunpcklbw      xmm2, xmm6, xmm4
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [esp+32], xmm5, [esp], xmm4
+%else
     AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [maddubsw_p1m5_256], xmm5, [maddubsw_m5p1_256], xmm4
+%endif
     vpackuswb       xmm0, xmm0, xmm0
     vmovlps         [p_dst], xmm0
 .width8_done:
+%ifdef X86_32_PICASM
+    mov             esp, i_width
+    pop             i_width
+%endif
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
@@ -3205,6 +3794,51 @@
     ret
 
 .width16:
+%ifdef X86_32_PICASM
+    push            i_width                ; i_width is saved so it can carry the pre-alignment esp
+    mov             i_width, esp
+    and             esp, 0xffffffe0        ; 32-byte align; five 32-byte constants follow
+    push            0x14141414    ;[esp+128] db20_256 (full 32 bytes; this path passes it where [db20_256] is used)
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0x14141414
+    push            0xfb01fb01    ;[esp+96] maddubsw_p1m5_256
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0xfb01fb01
+    push            0x01fb01fb    ;[esp+64] maddubsw_m5p1_256
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x01fb01fb
+    push            0x14fb14fb    ;[esp+32] maddubsw_m5p20_256
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0x14fb14fb
+    push            0xfb14fb14    ;[esp] maddubsw_p20m5_256
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+    push            0xfb14fb14
+%endif
     sub             i_height, 1
     test            i_height, 1
     jnz             .width16_yloop_begin_even
@@ -3231,7 +3865,11 @@
     lea             p_src, [p_src + 2 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm7
+%else
     AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm7
+%endif
     vpackuswb       ymm0, ymm0, ymm0
     vpermq          ymm0, ymm0, 1000b
     vmovdqa         [p_dst], xmm0
@@ -3261,12 +3899,20 @@
     vmovq           xmm6, [p_src]
     vpbroadcastq    ymm7, [p_src + 8]
     vpblendd        ymm6, ymm6, ymm7, 11110000b
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [esp+32], [esp], ymm0, ymm7
+%else
     AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm0, ymm7
+%endif
     vmovq           xmm7, [p_src + i_srcstride]
     vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
     vpblendd        ymm7, ymm7, ymm0, 11110000b
     vpunpcklbw      ymm6, ymm6, ymm7
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [esp+96], [esp+128], [esp+64], ymm0
+%else
     AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm0
+%endif
     vpackuswb       ymm1, ymm1, ymm2
     vpermq          ymm1, ymm1, 11011000b
     vmovdqa         [p_dst], xmm1
@@ -3275,13 +3921,21 @@
     vmovq           xmm0, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm0, ymm0, ymm1, 11110000b
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [esp+32], [esp], ymm2, ymm1 ; [esp+32] / [esp] stand in for maddubsw_m5p20_256 / maddubsw_p20m5_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm2, ymm1
+%endif
     vmovq           xmm1, [p_src + i_srcstride3]
     vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm1, ymm1, ymm2, 11110000b
     vpunpcklbw      ymm0, ymm0, ymm1
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [esp+96], [esp+128], [esp+64], ymm2 ; stack slots stand in for maddubsw_p1m5_256 / db20_256 / maddubsw_m5p1_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm2
+%endif
     vpackuswb       ymm3, ymm3, ymm4
     vpermq          ymm3, ymm3, 11011000b
     vmovdqa         [p_dst], xmm3
@@ -3290,12 +3944,20 @@
     vmovq           xmm2, [p_src]
     vpbroadcastq    ymm3, [p_src + 8]
     vpblendd        ymm2, ymm2, ymm3, 11110000b
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [esp+32], [esp], ymm4, ymm3 ; [esp+32] / [esp] stand in for maddubsw_m5p20_256 / maddubsw_p20m5_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm4, ymm3
+%endif
     vmovq           xmm3, [p_src + i_srcstride]
     vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
     vpblendd        ymm3, ymm3, ymm4, 11110000b
     vpunpcklbw      ymm2, ymm2, ymm3
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [esp+96], [esp+128], [esp+64], ymm4 ; stack slots stand in for maddubsw_p1m5_256 / db20_256 / maddubsw_m5p1_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm4
+%endif
     vpackuswb       ymm5, ymm5, ymm6
     vpermq          ymm5, ymm5, 11011000b
     vmovdqa         [p_dst], xmm5
@@ -3304,13 +3966,21 @@
     vmovq           xmm4, [p_src + 2 * i_srcstride]
     vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
     vpblendd        ymm4, ymm4, ymm5, 11110000b
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [esp+32], [esp], ymm6, ymm5 ; [esp+32] / [esp] stand in for maddubsw_m5p20_256 / maddubsw_p20m5_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [maddubsw_m5p20_256], [maddubsw_p20m5_256], ymm6, ymm5
+%endif
     vmovq           xmm5, [p_src + i_srcstride3]
     vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
     lea             p_src, [p_src + 4 * i_srcstride]
     vpblendd        ymm5, ymm5, ymm6, 11110000b
     vpunpcklbw      ymm4, ymm4, ymm5
+%ifdef X86_32_PICASM
+    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [esp+96], [esp+128], [esp+64], ymm6 ; stack slots stand in for maddubsw_p1m5_256 / db20_256 / maddubsw_m5p1_256 (same operand positions as the %else call)
+%else
     AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [maddubsw_p1m5_256], [db20_256], [maddubsw_m5p1_256], ymm6
+%endif
     vpackuswb       ymm7, ymm7, ymm0
     vpermq          ymm7, ymm7, 11011000b
     vmovdqa         [p_dst], xmm7
@@ -3318,6 +3988,10 @@
     lea             p_dst, [p_dst + 2 * i_dststride]
     sub             i_height, 8
     jg              .width16_yloop
+%ifdef X86_32_PICASM
+    mov             esp, i_width          ; reload esp from i_width; presumably it was set to the pre-alignment esp when the constants were pushed (set-up is outside this hunk - confirm)
+    pop             i_width               ; pop the dword now on top of the stack back into i_width (presumably its value saved before it was borrowed)
+%endif
     vzeroupper
     POP_XMM
     LOAD_6_PARA_POP
@@ -3358,9 +4032,32 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+%ifdef X86_32_PICASM
+    push            r1                    ; save r1; it is borrowed below to remember esp
+    mov             r1, esp               ; r1 = esp before alignment
+    and             esp, 0xfffffff0       ; round esp down to a 16-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203            ; lowest dword; [esp] now holds the full 16-byte pattern
+    vbroadcasti128  ymm4, [esp]           ; ymm4 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm6, [esp]           ; ymm6 = coefficient pattern in both lanes
+    mov             esp, r1               ; drop all 48 bytes of temporaries and the alignment padding
+    pop             r1                    ; restore the caller-visible r1
+%else
     vbroadcasti128  ymm4, [shufb_32435465768798A9]
     vbroadcasti128  ymm5, [shufb_011267784556ABBC]
     vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     cmp             i_width, 8
     je              .width8
     jg              .width16_yloop
@@ -3464,9 +4161,32 @@
     SIGN_EXTENSION  r3, r3d
     SIGN_EXTENSION  r4, r4d
     SIGN_EXTENSION  r5, r5d
+%ifdef X86_32_PICASM
+    push            r1                    ; save r1; it is borrowed below to remember esp
+    mov             r1, esp               ; r1 = esp before alignment
+    and             esp, 0xfffffff0       ; round esp down to a 16-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203            ; lowest dword; [esp] now holds the full 16-byte pattern
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm6, [esp]           ; ymm6 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm7, [esp]           ; ymm7 = coefficient pattern in both lanes
+    mov             esp, r1               ; drop all 48 bytes of temporaries and the alignment padding
+    pop             r1                    ; restore the caller-visible r1
+%else
     vbroadcasti128  ymm5, [shufb_32435465768798A9]
     vbroadcasti128  ymm6, [shufb_011267784556ABBC]
     vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     cmp             i_width, 9
     je              .width9
     jg              .width17
@@ -3607,9 +4327,32 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
+%ifdef X86_32_PICASM
+    push            r1                    ; save r1; it is borrowed below to remember esp
+    mov             r1, esp               ; r1 = esp before alignment
+    and             esp, 0xfffffff0       ; round esp down to a 16-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203            ; lowest dword; [esp] now holds the full 16-byte pattern
+    vbroadcasti128  ymm4, [esp]           ; ymm4 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm6, [esp]           ; ymm6 = coefficient pattern in both lanes
+    mov             esp, r1               ; drop all 48 bytes of temporaries and the alignment padding
+    pop             r1                    ; restore the caller-visible r1
+%else
     vbroadcasti128  ymm4, [shufb_32435465768798A9]
     vbroadcasti128  ymm5, [shufb_011267784556ABBC]
     vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -3732,9 +4475,32 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
+%ifdef X86_32_PICASM
+    push            r1                    ; save r1; it is borrowed below to remember esp
+    mov             r1, esp               ; r1 = esp before alignment
+    and             esp, 0xfffffff0       ; round esp down to a 16-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203            ; lowest dword; [esp] now holds the full 16-byte pattern
+    vbroadcasti128  ymm3, [esp]           ; ymm3 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm4, [esp]           ; ymm4 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = coefficient pattern in both lanes
+    mov             esp, r1               ; drop all 48 bytes of temporaries and the alignment padding
+    pop             r1                    ; restore the caller-visible r1
+%else
     vbroadcasti128  ymm3, [shufb_32435465768798A9]
     vbroadcasti128  ymm4, [shufb_011267784556ABBC]
     vbroadcasti128  ymm5, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -3953,9 +4719,32 @@
     SIGN_EXTENSION  r3, r3d
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
+%ifdef X86_32_PICASM
+    push            r1                    ; save r1; it is borrowed below to remember esp
+    mov             r1, esp               ; r1 = esp before alignment
+    and             esp, 0xfffffff0       ; round esp down to a 16-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203            ; lowest dword; [esp] now holds the full 16-byte pattern
+    vbroadcasti128  ymm4, [esp]           ; ymm4 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm6, [esp]           ; ymm6 = coefficient pattern in both lanes
+    mov             esp, r1               ; drop all 48 bytes of temporaries and the alignment padding
+    pop             r1                    ; restore the caller-visible r1
+%else
     vbroadcasti128  ymm4, [shufb_32435465768798A9]
     vbroadcasti128  ymm5, [shufb_011267784556ABBC]
     vbroadcasti128  ymm6, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 1
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4114,9 +4903,47 @@
     sub             p_src, i_srcstride
     sub             p_src, i_srcstride
     lea             i_srcstride3, [3 * i_srcstride]
+%ifdef X86_32_PICASM
+    push            r5                    ; save r5; it holds the original esp until the tables are released
+    mov             r5, esp               ; r5 = esp before alignment (this block does not restore esp itself)
+    and             esp, 0xffffffe0       ; round esp down to a 32-byte boundary
+    push            0x090a0809        ;shufb_32435465768798A9 (first push = highest dword of the 16-byte value)
+    push            0x07080607
+    push            0x05060405
+    push            0x03040203
+    vbroadcasti128  ymm5, [esp]           ; ymm5 = that 128-bit pattern replicated in both lanes
+    push            0x0c0b0b0a            ; stack copy of shufb_011267784556ABBC (cf. %else)
+    push            0x06050504
+    push            0x08070706
+    push            0x02010100
+    vbroadcasti128  ymm6, [esp]           ; ymm6 = second pattern in both lanes
+    push            0x01fb01fb            ; stack copy of maddubsw_p1m5_p1m5_m5p1_m5p1_128 (cf. %else); 0xfb = -5
+    push            0xfb01fb01
+    push            0x01fb01fb
+    push            0xfb01fb01
+    vbroadcasti128  ymm7, [esp]           ; ymm7 = coefficient pattern in both lanes
+    sub             esp, 16               ; 48 bytes pushed so far + 16 = 64, so the tables below are 32-byte aligned
+    push            0x0000fe0a    ;maddubsw_m2p10_m40m40_p10m2_p0p0_256 (32 bytes; ends up at [esp+32])
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0x0000fe0a
+    push            0xd8d80afe
+    push            0x80008000    ;dwm32768_256 (32 bytes; ends up at [esp])
+    push            0x80008000
+    push            0x80008000
+    push            0x80008000
+    push            0x80008000
+    push            0x80008000
+    push            0x80008000
+    push            0x80008000            ; esp stays lowered past this block: the two tables remain live on the stack
+%else
     vbroadcasti128  ymm5, [shufb_32435465768798A9]
     vbroadcasti128  ymm6, [shufb_011267784556ABBC]
     vbroadcasti128  ymm7, [maddubsw_p1m5_p1m5_m5p1_m5p1_128]
+%endif
     sub             i_height, 3
 .yloop:
     vmovdqu         xmm0, [p_src - 2]
@@ -4134,7 +4961,14 @@
     vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
     lea             p_src, [p_src + 4 * i_srcstride]
     vpunpckhqdq     ymm4, ymm4, ymm0
+%ifdef X86_32_PICASM
+    vpmaddubsw      ymm4, ymm4, [esp+32]  ; multiply-add byte pairs with the table at [esp+32] (presumably the stack copy of maddubsw_m2p10_m40m40_p10m2_p0p0_256 - confirm)
+    vpmaddwd        ymm4, ymm4, [esp]     ; multiply-add word pairs with [esp], which takes the place of [dwm32768_256] in the %else call
+    vpshufd         ymm2, ymm4, 10110001b ; ymm2 = ymm4 with the two dwords of each qword swapped
+    vpaddd          ymm4, ymm4, ymm2      ; every dword = sum of its dword pair (presumably a hand-expanded AVX2_FilterHorizontalbw_4px - confirm)
+%else
     AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
+%endif
     vmovlps         [p_dst + 26], xmm4
     vmovdqa         [p_dst + 16], xmm3
     vextracti128    xmm2, ymm4, 1
@@ -4157,7 +4991,16 @@
     vmovdqu         xmm3, [p_src + i_srcstride - 2]
     vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
     vpunpckhqdq     ymm4, ymm0, ymm3
+%ifdef X86_32_PICASM
+    vpmaddubsw      ymm4, ymm4, [esp+32]  ; multiply-add byte pairs with the table at [esp+32] (presumably the stack copy of maddubsw_m2p10_m40m40_p10m2_p0p0_256 - confirm)
+    vpmaddwd        ymm4, ymm4, [esp]     ; multiply-add word pairs with [esp], which takes the place of [dwm32768_256] in the %else call
+    vpshufd         ymm2, ymm4, 10110001b ; ymm2 = ymm4 with the two dwords of each qword swapped
+    vpaddd          ymm4, ymm4, ymm2      ; every dword = sum of its dword pair
+    mov             esp, r5               ; release the on-stack tables; r5 presumably still holds the esp saved during set-up (outside this hunk)
+    pop             r5                    ; restore r5 from the slot that is now on top of the stack
+%else
     AVX2_FilterHorizontalbw_4px ymm4, [dwm32768_256], ymm2
+%endif
     AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
     AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
     vextracti128    xmm4, ymm4, 1
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -773,9 +773,28 @@
 %endif
 
     pxor        xmm4,   xmm4
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    movdqu      xmm5,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    push        0xffff0001                ; stack copy of HSumSubDW1 (cf. %else): words 1,-1 repeated
+    push        0xffff0001
+    push        0xffff0001
+    push        0xffff0001
+    movdqu      xmm6,   [esp]
+    push        0x00010001                ; stack copy of PDW1 (cf. %else): eight words of 1
+    push        0x00010001
+    push        0x00010001
+    push        0x00010001
+    movdqu      xmm7,   [esp]
+    add         esp, 48                   ; release the 12 dwords pushed above
+%else
     movdqa      xmm5,   [HSumSubDB1]
     movdqa      xmm6,   [HSumSubDW1]
     movdqa      xmm7,   [PDW1]
+%endif
     sub         r0,    r1
     movdqu      xmm0,   [r0]
     movhlps     xmm1,   xmm0
@@ -974,7 +993,88 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r5, r5d
 loop_chroma_satdx3:
+%ifdef X86_32_PICASM
+    mov         r0, esp                   ; borrow r0 to remember esp; r0 is reloaded from the stack below
+    and         esp, 0xfffffff0           ; 16-byte align so the movdqa loads from [esp] are legal
+    push        0xff01ff01                ; bytes (low to high): 1 x8, then 1,-1 x4
+    push        0xff01ff01
+    push        0x01010101
+    push        0x01010101
+    movdqa      xmm5, [esp]
+    push        0xffff0001                ; words 1,-1 repeated
+    push        0xffff0001
+    push        0xffff0001
+    push        0xffff0001
+    movdqa      xmm6, [esp]
+    push        0x00010001                ; eight words of 1
+    push        0x00010001
+    push        0x00010001
+    push        0x00010001
+    movdqa      xmm7, [esp]
+    mov         esp, r0                   ; release temporaries and alignment padding
+    mov         r0, [esp + push_num*4 + 4] ; reload r0 from its stack slot (presumably the first argument - confirm against the function prologue)
+
+    sub         r0,    r1                 ; presumably what follows is SSE41_ChromaGetX38x8Satd (the %else path) expanded by hand - TODO confirm against that macro
+    movq        xmm0,  [r0]
+    punpcklqdq  xmm0,  xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+    movdqa      [r6],  xmm0 ;V
+    add         r0,     r1
+    pinsrb      xmm0,   byte[r0-1], 0     ; gather the byte at offset -1 of 8 consecutive rows (stride r1) into bytes 0..7
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+;movdqa      [r6+16], xmm0 ;H
+;(sum+2)>>2
+    mov        DWORD [r6+16], 0x0002      ; build the dword vector {2,0,2,0} in the scratch slot at [r6+16]
+    mov        DWORD [r6+20], 0x0000
+    mov        DWORD [r6+24], 0x0002
+    mov        DWORD [r6+28], 0x0000
+    movdqa      xmm6,   [r6+16]           ; xmm6 = rounding term; this overwrites the constant loaded into xmm6 above
+    movdqa      [r6+16], xmm0 ;H
+
+    movdqa      xmm5,   xmm4
+    punpckhqdq  xmm5,   xmm1
+    paddd       xmm5,   xmm6
+    psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+    paddd       xmm6,   xmm6              ; rounding term doubled: {4,0,4,0}
+    paddd       xmm4,   xmm1
+    paddd       xmm4,   xmm6
+    psrld       xmm4,   3
+;satd *16
+    pslld       xmm5,   4
+    pslld       xmm4,   4
+;temp satd
+    movdqa      xmm6,   xmm4
+    punpcklqdq  xmm4,   xmm5
+    psllq       xmm4,   32                ; shift left then right by 32: clears the upper dword of each qword
+    psrlq       xmm4,   32
+    movdqa      [r6+32], xmm4
+    punpckhqdq  xmm5,   xmm6
+    psllq       xmm5,   32
+    psrlq       xmm5,   32
+    movdqa      [r6+48], xmm5
+
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
+    mov         r0,    0
+    SSE41_ChromaGetX38x4Satd r0, 0        ; first 4-row half (r0 = 0)
+    inc         r0
+    SSE41_ChromaGetX38x4Satd r0, 0        ; second 4-row half (r0 = 1)
+%else
     SSE41_ChromaGetX38x8Satd
+%endif
     SSEReg2MMX  xmm4, mm0,mm1
     SSEReg2MMX  xmm5, mm2,mm3
     SSEReg2MMX  xmm6, mm5,mm6
@@ -981,7 +1081,89 @@
     mov r0,     arg8
     mov r2,     arg9
 
+%ifdef X86_32_PICASM
+    mov         r0, esp                   ; borrow r0 to remember esp; r0 is reloaded from arg8 below
+    and         esp, 0xfffffff0           ; 16-byte align so the movdqa loads from [esp] are legal
+    push        0xff01ff01                ; bytes (low to high): 1 x8, then 1,-1 x4
+    push        0xff01ff01
+    push        0x01010101
+    push        0x01010101
+    movdqa      xmm5,   [esp]
+    push        0xffff0001                ; words 1,-1 repeated
+    push        0xffff0001
+    push        0xffff0001
+    push        0xffff0001
+    movdqa      xmm6,   [esp]
+    push        0x00010001                ; eight words of 1
+    push        0x00010001
+    push        0x00010001
+    push        0x00010001
+    movdqa      xmm7,   [esp]
+    mov         esp,    r0                ; release temporaries and alignment padding
+    mov r0,     arg8                      ; r0 was used as scratch above, so load arg8 into it again
+
+    sub         r0,    r1                 ; presumably what follows is SSE41_ChromaGetX38x8Satd (the %else path) expanded by hand - TODO confirm against that macro
+    movq        xmm0,   [r0]
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+    movdqa      [r6],  xmm0 ;V
+    add         r0,    r1
+    pinsrb      xmm0,   byte[r0-1], 0     ; gather the byte at offset -1 of 8 consecutive rows (stride r1) into bytes 0..7
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+    ;movdqa      [r6+16], xmm0 ;H
+;(sum+2)>>2
+
+    mov        DWORD [r6+16], 0x0002      ; build the dword vector {2,0,2,0} in the scratch slot at [r6+16]
+    mov        DWORD [r6+20], 0x0000
+    mov        DWORD [r6+24], 0x0002
+    mov        DWORD [r6+28], 0x0000
+    movdqa      xmm6,   [r6+16]           ; xmm6 = rounding term; this overwrites the constant loaded into xmm6 above
+    movdqa      [r6+16], xmm0 ;H
+
+    movdqa      xmm5,   xmm4
+    punpckhqdq  xmm5,   xmm1
+    paddd       xmm5,   xmm6
+    psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+    paddd       xmm6,   xmm6              ; rounding term doubled: {4,0,4,0}
+    paddd       xmm4,   xmm1
+    paddd       xmm4,   xmm6
+    psrld       xmm4,   3
+;satd *16
+    pslld       xmm5,   4
+    pslld       xmm4,   4
+;temp satd
+    movdqa      xmm6,   xmm4
+    punpcklqdq  xmm4,   xmm5
+    psllq       xmm4,   32                ; shift left then right by 32: clears the upper dword of each qword
+    psrlq       xmm4,   32
+    movdqa      [r6+32], xmm4
+    punpckhqdq  xmm5,   xmm6
+    psllq       xmm5,   32
+    psrlq       xmm5,   32
+    movdqa      [r6+48], xmm5
+
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
+    mov         r0,    0
+    SSE41_ChromaGetX38x4Satd r0, 0        ; first 4-row half (r0 = 0)
+    inc         r0
+    SSE41_ChromaGetX38x4Satd r0, 0        ; second 4-row half (r0 = 1)
+%else
     SSE41_ChromaGetX38x8Satd
+%endif
 
     MMXReg2SSE  xmm0, xmm3, mm0, mm1
     MMXReg2SSE  xmm1, xmm3, mm2, mm3
@@ -1279,7 +1461,16 @@
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSwapSumSubDB1 (cf. %else): per 8 bytes 1,1,1,1 then 1,-1,1,-1
+    push        0x01010101
+    push        0xff01ff01
+    push        0x01010101
+    movdqu      xmm4,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    add         esp, 16                   ; release the four dwords pushed above
+%else
     movdqa      xmm4,[HSwapSumSubDB1]
+%endif
     movd        xmm2,[r2]
     movd        xmm5,[r2+r3]
     shufps      xmm2,xmm5,0
@@ -1337,7 +1528,17 @@
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
+
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    movdqu      xmm7,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    add         esp, 16                   ; release the four dwords pushed above
+%else
     movdqa      xmm7, [HSumSubDB1]
+%endif
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1370,7 +1571,17 @@
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
+
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    movdqu      xmm7,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    add         esp, 16                   ; release the four dwords pushed above
+%else
     movdqa      xmm7, [HSumSubDB1]
+%endif
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1410,7 +1621,16 @@
     push  r0
     push  r2
 
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    movdqu      xmm7,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    add         esp, 16                   ; release the four dwords pushed above
+%else
     movdqa      xmm7, [HSumSubDB1]
+%endif
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1457,7 +1677,16 @@
     push  r0
     push  r2
 
+%ifdef X86_32_PICASM
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    movdqu      xmm7,   [esp]             ; unaligned load: esp is not re-aligned in this sequence
+    add         esp, 16                   ; release the four dwords pushed above
+%else
     movdqa      xmm7, [HSumSubDB1]
+%endif
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1634,7 +1863,19 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
+%ifdef X86_32_PICASM
+    mov         r1, esp                   ; borrow r1 to remember esp; r1 is reloaded from the stack below
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    vbroadcasti128 ymm7, [esp]            ; ymm7 = the 16-byte pattern replicated in both lanes
+    mov            esp, r1                ; release temporaries and alignment padding
+    mov            r1, [esp + push_num*4 + 8] ; reload r1 from its stack slot (presumably the second argument - confirm against the function prologue)
+%else
     vbroadcasti128 ymm7, [HSumSubDB1]
+%endif
     lea            r5, [3 * r1]
     lea            r6, [3 * r3]
     vpxor          ymm6, ymm6, ymm6
@@ -1700,8 +1941,21 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
+%ifdef X86_32_PICASM
+    mov         r0, esp                   ; borrow r0 to remember esp; r0 is reloaded from the stack below
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0xff01ff01                ; stack copy of HSumSubDB1 (cf. %else): upper 8 bytes are 1,-1 repeated
+    push        0xff01ff01
+    push        0x01010101                ; lower 8 bytes are all 1
+    push        0x01010101
+    vpbroadcastq xmm0, [esp]              ; xmm0 = low qword of the pattern (all 1 bytes), replicated
+    vpbroadcastq ymm6, [esp + 8]          ; ymm6 = high qword of the pattern (1,-1 bytes), replicated
+    mov            esp, r0                ; release temporaries and alignment padding
+    mov            r0, [esp + push_num*4 + 4] ; reload r0 from its stack slot (presumably the first argument - confirm against the function prologue)
+%else
     vpbroadcastq xmm0, [HSumSubDB1]
     vpbroadcastq ymm6, [HSumSubDB1 + 8]
+%endif
     vpblendd     ymm6, ymm0, ymm6, 11110000b
     lea          r5, [3 * r1]
     lea          r6, [3 * r3]
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -132,7 +132,20 @@
 %macro COPY_16_TIMES 2
     movdqa      %2, [%1-16]
     psrldq      %2, 15
+%ifdef X86_32_PICASM
+    push        r5                        ; save r5; it is borrowed to remember esp
+    mov         r5, esp                   ; r5 = esp before alignment
+    and         esp, 0xfffffff0           ; 16-byte align: pmuludq with an xmm destination needs an aligned memory operand
+    push        0x01010101    ;mmx_01bytes (16 bytes of 0x01 built on the stack)
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     %2, [esp]                 ; same multiply as the %else path, reading the constant from the stack
+    mov         esp, r5                   ; release temporaries and alignment padding
+    pop         r5                        ; restore r5
+%else
     pmuludq     %2, [mmx_01bytes]
+%endif
     pshufd      %2, %2, 0
 %endmacro
 
@@ -139,7 +152,20 @@
 %macro COPY_16_TIMESS 3
     movdqa      %2, [%1+%3-16]
     psrldq      %2, 15
+%ifdef X86_32_PICASM
+    push        r5                        ; save r5; it is borrowed to remember esp
+    mov         r5, esp                   ; r5 = esp before alignment
+    and         esp, 0xfffffff0           ; 16-byte align: pmuludq with an xmm destination needs an aligned memory operand
+    push        0x01010101    ;mmx_01bytes (16 bytes of 0x01 built on the stack)
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     %2, [esp]                 ; same multiply as the %else path, reading the constant from the stack
+    mov         esp, r5                   ; release temporaries and alignment padding
+    pop         r5                        ; restore r5
+%else
     pmuludq     %2, [mmx_01bytes]
+%endif
     pshufd      %2, %2, 0
 %endmacro
 
@@ -179,23 +205,50 @@
     %assign push_num 0
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
+%ifdef X86_32_PICASM
+    push        r3                        ; save r3; it holds the original esp while the constant stays on the stack
+    mov         r3, esp                   ; r3 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101    ;mmx_01bytes (16 bytes of 0x01; left at [esp] past the end of this block)
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+%endif
 
     movzx       r2, byte [r0-1]
     movd        xmm0,   r2d
+%ifdef X86_32_PICASM
+    pmuludq     xmm0,   [esp]             ; [esp] takes the place of [mmx_01bytes]; expects the aligned 16-byte constant pushed earlier to still be on top of the stack
+%else
     pmuludq     xmm0,   [mmx_01bytes]
+%endif
 
     movzx       r2, byte [r0+r1-1]
     movd        xmm1,   r2d
+%ifdef X86_32_PICASM
+    pmuludq     xmm1,   [esp]             ; [esp] takes the place of [mmx_01bytes]; expects the aligned 16-byte constant pushed earlier to still be on top of the stack
+%else
     pmuludq     xmm1,   [mmx_01bytes]
+%endif
 
     lea         r0, [r0+r1]
     movzx       r2, byte [r0+r1-1]
     movd        xmm2,   r2d
+%ifdef X86_32_PICASM
+    pmuludq     xmm2,   [esp]             ; [esp] takes the place of [mmx_01bytes]; expects the aligned 16-byte constant pushed earlier to still be on top of the stack
+%else
     pmuludq     xmm2,   [mmx_01bytes]
+%endif
 
     movzx       r2, byte [r0+2*r1-1]
     movd        xmm3,   r2d
+%ifdef X86_32_PICASM
+    pmuludq     xmm3,   [esp]             ; last use of the on-stack constant that takes the place of [mmx_01bytes]
+    mov         esp, r3                   ; release the constant; r3 is expected to still hold the esp saved before the pushes
+    pop         r3                        ; restore r3
+%else
     pmuludq     xmm3,   [mmx_01bytes]
+%endif
 
     sub         r0,    r1
     movd        [r0], xmm0
@@ -223,11 +276,37 @@
     ;for H
     pxor    xmm7,   xmm7
     movq    xmm0,   [r0]
+%ifdef X86_32_PICASM
+    push    r0                            ; save r0; it is borrowed to remember esp
+    mov     r0, esp                       ; r0 = esp before alignment
+    and     esp, 0xfffffff0               ; 16-byte align for the movdqa below
+    push    0x00010002                    ; stack copy of sse2_plane_dec (cf. %else): words 8,7,6,5,4,3,2,1 from low to high
+    push    0x00030004
+    push    0x00050006
+    push    0x00070008
+    movdqa  xmm5,   [esp]
+    mov     esp, r0                       ; release temporaries and alignment padding
+    pop     r0                            ; restore r0
+%else
     movdqa  xmm5,   [sse2_plane_dec]
+%endif
     punpcklbw xmm0, xmm7
     pmullw  xmm0,   xmm5
     movq    xmm1,   [r0 + 9]
+%ifdef X86_32_PICASM
+    push    r0                            ; save r0; it is borrowed to remember esp
+    mov     r0, esp                       ; r0 = esp before alignment
+    and     esp, 0xfffffff0               ; 16-byte align for the movdqa below
+    push    0x00080007    ;sse2_plane_inc (words 1,2,3,4,5,6,7,8 from low to high)
+    push    0x00060005
+    push    0x00040003
+    push    0x00020001
+    movdqa  xmm6,   [esp]
+    mov     esp, r0                       ; release temporaries and alignment padding
+    pop     r0                            ; restore r0
+%else
     movdqa  xmm6,   [sse2_plane_inc]
+%endif
     punpcklbw xmm1, xmm7
     pmullw  xmm1,   xmm6
     psubw   xmm1,   xmm0
@@ -282,7 +361,19 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r2, r2
+%ifdef X86_32_PICASM
+    mov     r2, esp                       ; borrow r2 to remember esp; it is cleared again below
+    and     esp, 0xfffffff0               ; 16-byte align for the movdqa below
+    push    0x0000ffff    ;sse2_plane_inc_minus (words -7,-6,-5,-4,-3,-2,-1,0 from low to high)
+    push    0xfffefffd
+    push    0xfffcfffb
+    push    0xfffafff9
+    movdqa  xmm5,   [esp]
+    mov     esp, r2                       ; release temporaries and alignment padding
+    xor     r2, r2                        ; r2 back to 0, the value it had before being borrowed
+%else
     movdqa  xmm5,   [sse2_plane_inc_minus]
+%endif
 
 get_i16x16_luma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -395,11 +486,30 @@
 
     pxor    mm7,    mm7
     movq    mm0,    [r0]
+%ifdef X86_32_PICASM
+    push    r5                            ; save r5; it holds the original esp while the tables stay on the stack
+    mov     r5, esp                       ; r5 = esp before alignment (this block does not restore esp)
+    and     esp, 0xfffffff0               ; round esp down to a 16-byte boundary
+    push    0x00010002    ;sse2_plane_dec_c (ends up at [esp+24]: words 4,3,2,1)
+    push    0x00030004
+    push    0x00040003    ;sse2_plane_inc_c (ends up at [esp+16]: words 1,2,3,4)
+    push    0x00020001
+    push    0x00040003    ; start of the 16-byte table left at [esp]: words -3,-2,-1,0,1,2,3,4 (presumably sse2_plane_mul_b_c - confirm)
+    push    0x00020001
+    push    0x0000ffff
+    push    0xfffefffd
+    movq    mm5,    [esp+24]              ; stack copy of sse2_plane_dec_c (cf. %else)
+%else
     movq    mm5,    [sse2_plane_dec_c]
+%endif
     punpcklbw mm0,  mm7
     pmullw  mm0,    mm5
     movq    mm1,    [r0 + 5]
+%ifdef X86_32_PICASM
+    movq    mm6,    [esp+16]              ; takes the place of [sse2_plane_inc_c]; expects the tables pushed earlier to still be on top of the stack
+%else
     movq    mm6,    [sse2_plane_inc_c]
+%endif
     punpcklbw mm1,  mm7
     pmullw  mm1,    mm6
     psubw   mm1,    mm0
@@ -451,7 +561,13 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r2, r2
+%ifdef X86_32_PICASM
+    movdqa  xmm5,   [esp]                 ; takes the place of [sse2_plane_mul_b_c]; expects esp to still point at the 16-byte-aligned table pushed earlier (set-up is outside this hunk)
+    mov     esp, r5                       ; release the on-stack tables; r5 is expected to still hold the saved esp
+    pop     r5                            ; restore r5
+%else
     movdqa  xmm5,   [sse2_plane_mul_b_c]
+%endif
 
 get_i_chroma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -513,7 +629,20 @@
     movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
     pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
     pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; stack copy of mmx_01bytes (cf. %else): every byte 0x01
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pand        mm1,[esp]   ;set the odd bit (keeps only bit 0 of each byte; 64-bit read from the stack copy)
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm1,[mmx_01bytes]   ;set the odd bit
+%endif
     psubusb     mm3,mm1             ;decrease 1 from odd bytes
     pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
@@ -538,7 +667,20 @@
     movq        %1,     [%3-8]
     psrlq       %1,     38h
 
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; stack copy of mmx_01bytes (cf. %else): every byte 0x01
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      %1,     [esp]             ; same multiply as the %else path, reading the constant from the stack
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pmullw      %1,     [mmx_01bytes]
+%endif
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -547,7 +689,20 @@
     movq        %1,     [%3+r1-8]
     psrlq       %1,     38h
 
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; stack copy of mmx_01bytes (cf. %else): every byte 0x01
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      %1,     [esp]             ; same multiply as the %else path, reading the constant from the stack
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pmullw      %1,     [mmx_01bytes]
+%endif
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -561,7 +716,20 @@
     movq        mm0,    [r2-8]
     psrlq       mm0,    38h
 
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; stack copy of mmx_01bytes (cf. %else): every byte 0x01
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      mm0,        [esp]         ; same multiply as the %else path, reading the constant from the stack
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pmullw      mm0,        [mmx_01bytes]
+%endif
     pshufw      mm0,    mm0,    0
     movq        [r0],   mm0
 
@@ -673,7 +841,18 @@
     pavgb       mm1, mm0
 
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pand        mm4, [esp]                ; keep only bit 0 of each byte (stack copy in place of [mmx_01bytes])
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm4, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
@@ -758,7 +937,18 @@
     pavgb       mm2, mm0
 
     pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pand        mm5, [esp]      ; set the odd bit (keeps only bit 0 of each byte; stack copy in place of [mmx_01bytes])
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm5, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
@@ -840,7 +1030,18 @@
     pavgb       mm2, mm0
 
     pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pand        mm3, [esp]      ; set the odd bit (keeps only bit 0 of each byte; stack copy in place of [mmx_01bytes])
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm3, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
     movq        mm3, mm0
@@ -920,7 +1121,18 @@
     movq        mm3, mm1
     pavgb       mm1, mm2
     pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pand        mm3, [esp]      ; set the odd bit (keeps only bit 0 of each byte; stack copy in place of [mmx_01bytes])
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm3, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
     pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
@@ -987,7 +1199,18 @@
     movq        mm4, mm2
     pavgb       mm2, mm0
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pand        mm4, [esp]              ; set the odd bit (keeps only bit 0 of each byte; stack copy in place of [mmx_01bytes])
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     pand        mm4, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
@@ -1052,7 +1275,18 @@
     movq        mm1, mm2
     paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
+%ifdef X86_32_PICASM
+    push        r0                        ; save r0; it is borrowed to remember esp
+    mov         r0, esp                   ; r0 = esp before alignment
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x00000000                ; high dword of the 64-bit constant
+    push        0x00000002                ; low dword: the qword at [esp] is 2
+    movq        mm4, [esp]                ; stack copy in place of [mmx_0x02] (cf. %else)
+    mov         esp, r0                   ; release temporaries and alignment padding
+    pop         r0                        ; restore r0
+%else
     movq        mm4, [mmx_0x02]
+%endif
 
     paddq       mm0, mm4
     psrlq       mm0, 0x02
@@ -1068,13 +1302,30 @@
     paddq       mm1, mm4
     psrlq       mm1, 0x03
 
+%ifdef X86_32_PICASM
+    push        r5                        ; save r5; it holds the original esp while the constant stays on the stack
+    mov         r5, esp                   ; r5 = esp before alignment (this block does not restore esp)
+    and         esp, 0xfffffff0           ; round esp down to a 16-byte boundary
+    push        0x01010101                ; 8 bytes of 0x01: the part of mmx_01bytes a 64-bit MMX operand reads
+    push        0x01010101
+    pmuludq     mm0, [esp]                ; stack copy in place of [mmx_01bytes] (cf. %else)
+    pmuludq     mm3, [esp]
+%else
     pmuludq     mm0, [mmx_01bytes]
     pmuludq     mm3, [mmx_01bytes]
+%endif
     psllq       mm0, 0x20
     pxor        mm0, mm3                 ; mm0 = m_up
 
+%ifdef X86_32_PICASM
+    pmuludq     mm2, [esp]                ; takes the place of [mmx_01bytes]; expects the 0x01 bytes pushed earlier to still be on top of the stack
+    pmuludq     mm1, [esp]
+    mov         esp, r5                   ; release the constant; r5 is expected to still hold the esp saved before the pushes
+    pop         r5                        ; restore r5
+%else
     pmuludq     mm2, [mmx_01bytes]
     pmuludq     mm1, [mmx_01bytes]
+%endif
     psllq       mm1, 0x20
     pxor        mm1, mm2                 ; mm2 = m_down
 
@@ -1134,7 +1385,20 @@
     movd        xmm1, r2d
     paddw       xmm0, xmm1
     psrld       xmm0, 0x05
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment, needed by the xmm memory operand
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     xmm0, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmuludq     xmm0, [mmx_01bytes]
+%endif
     pshufd      xmm0, xmm0, 0
 
     movdqa      [r4],       xmm0
@@ -1258,7 +1522,20 @@
     SIGN_EXTENSION r1, r1d
     lea r2, [2*r1+r1]       ; 3*kiStride
 
+%ifdef X86_32_PICASM
+    push    r0                          ; preserve r0; it is reused to remember esp
+    mov     r0, esp
+    and     esp, 0xfffffff0             ; 16-byte alignment required by movdqa
+    push    0x80808080                  ; 16 bytes of 0x80 on the stack replace [sse2_dc_0x80]
+    push    0x80808080
+    push    0x80808080
+    push    0x80808080
+    movdqa xmm0, [esp]
+    mov     esp, r0                     ; drop the temporary constant
+    pop     r0                          ; r0 is valid again for the stores that follow
+%else
     movdqa xmm0, [sse2_dc_0x80]
+%endif
     movdqa xmm1, xmm0
     movdqa [r0], xmm0
     movdqa [r0+r1], xmm1
@@ -1375,7 +1652,13 @@
     paddw xmm1, xmm3            ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
     punpckhqdq xmm1, xmm7
     punpcklqdq xmm0, xmm1       ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+%ifdef X86_32_PICASM
+    pcmpeqw  xmm6, xmm6                 ; every word = 0xffff
+    psrlw    xmm6, 15                   ; every word = 1
+    psllw    xmm6, 1                    ; every word = 2, built in-register instead of loading [sse2_wd_0x02]
+%else
     movdqa xmm6, [sse2_wd_0x02]
+%endif
     paddw xmm0, xmm6
     psraw xmm0, 02h
     packuswb xmm0, xmm7
@@ -1400,7 +1683,18 @@
     LOAD_2_PARA
     SIGN_EXTENSION r1, r1d
     lea r2, [2*r1+r1]
+%ifdef X86_32_PICASM
+    push r0                             ; preserve r0; it is reused to remember esp
+    mov  r0, esp
+    and  esp, 0xfffffff0                ; round esp down to a 16-byte boundary
+    push 0x80808080                     ; 8 bytes of 0x80 on the stack replace the low qword of [sse2_dc_0x80]
+    push 0x80808080
+    movq mm0, [esp]
+    mov  esp, r0                        ; drop the temporary constant
+    pop  r0
+%else
     movq mm0, [sse2_dc_0x80]
+%endif
     movq mm1, mm0
     movq [r0], mm0
     movq [r0+r1], mm1
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,7 +75,9 @@
 
 #ifdef X86_ASM
 
+#ifndef X86_32_PICASM  // score.asm does not assemble this routine when X86_32_PICASM is defined
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
+#endif  // !X86_32_PICASM
 int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
 
 /****************************************************************************
@@ -84,7 +86,9 @@
 void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
 void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
 void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
+#ifndef X86_32_PICASM  // score.asm does not assemble this routine when X86_32_PICASM is defined
 int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
+#endif  // !X86_32_PICASM
 
 /****************************************************************************
  * DCT functions
--- a/codec/encoder/core/inc/set_mb_syn_cavlc.h
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -78,10 +78,12 @@
 int32_t CavlcParamCal_c (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                          int32_t iEndIdx);
 #ifdef  X86_ASM
+#ifndef  X86_32_PICASM  // coeff.asm does not assemble either routine when X86_32_PICASM is defined
 int32_t CavlcParamCal_sse2 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                             int32_t iEndIdx);
 int32_t CavlcParamCal_sse42 (int16_t* pCoffLevel, uint8_t* pRun, int16_t* pLevel, int32_t* pTotalCoeffs ,
                              int32_t iEndIdx);
+#endif  // !X86_32_PICASM
 #endif
 
 #if defined(__cplusplus)
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,7 +500,9 @@
     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmx;
   }
   if (uiCpuFlag & WELS_CPU_SSE2) {
+#ifndef X86_32_PICASM  // the sse2 routine is not built for X86_32_PICASM; the previously assigned pointer is kept
     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
+#endif  // !X86_32_PICASM
     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
 
     pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
@@ -514,7 +516,9 @@
 
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_sse2;
     pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_sse2;
+#ifndef X86_32_PICASM  // the sse2 routine is not built for X86_32_PICASM; the previously assigned pointer is kept
     pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_sse2;
+#endif  // !X86_32_PICASM
 
     pFuncList->pfDctT4                  = WelsDctT4_sse2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_sse2;
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -280,14 +280,19 @@
   pFuncList->pfCavlcParamCal = CavlcParamCal_c;
 
 #if defined(X86_32_ASM)
+#ifndef X86_32_PICASM  // CavlcParamCal_sse2 is not built for X86_32_PICASM; CavlcParamCal_c stays selected
   if (uiCpuFlag & WELS_CPU_SSE2) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse2;
   }
 #endif
+#endif  // X86_32_ASM (the #endif on the previous line now closes the X86_32_PICASM guard)
+
 #ifdef X86_ASM
+#ifndef X86_32_PICASM  // CavlcParamCal_sse42 is not built for X86_32_PICASM
   if (uiCpuFlag & WELS_CPU_SSE42) {
     pFuncList->pfCavlcParamCal = CavlcParamCal_sse42;
   }
+#endif  // !X86_32_PICASM
 #endif
   if (iEntropyCodingModeFlag) {
     pFuncList->pfStashMBStatus = StashMBStatusCabac;
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -369,6 +369,7 @@
 
 %ifdef X86_32
 
+%ifndef X86_32_PICASM
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -506,8 +507,10 @@
     pop edi
     pop ebx
     ret
-%endif
+%endif ;%ifndef X86_32_PICASM
+%endif ;%ifdef X86_32
 
+%ifndef X86_32_PICASM
 ;***********************************************************************
 ;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
@@ -670,3 +673,5 @@
 %undef r_tmp2d
 %undef p_shufb_lut
 %undef p_run_lut
+
+%endif  ;ifndef X86_32_PICASM
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -144,7 +144,20 @@
 %macro COPY_16_TIMES 2
     movdqa      %2, [%1-16]
     psrldq      %2, 15
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment for the memory operand below
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     %2, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmuludq     %2, [mmx_01bytes]
+%endif
     pshufd      %2, %2, 0
 %endmacro
 
@@ -151,7 +164,20 @@
 %macro COPY_16_TIMESS 3
     movdqa      %2, [%1+%3-16]
     psrldq      %2, 15
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment for the memory operand below
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     %2, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmuludq     %2, [mmx_01bytes]
+%endif
     pshufd      %2, %2, 0
 %endmacro
 
@@ -193,11 +219,26 @@
     SIGN_EXTENSION r2, r2d
     movzx       r3, byte [r1-1]
     movd        xmm0,   r3d
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it keeps the original esp until restored below
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment for the xmm memory operands
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     xmm0,   [esp]           ; esp stays lowered: the constant is reused for xmm1..xmm3
+%else
     pmuludq     xmm0,   [mmx_01bytes]
+%endif
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm1,   r3d
+%ifdef X86_32_PICASM
+    pmuludq     xmm1,   [esp]           ; reuses the 0x01 bytes pushed before the xmm0 multiply
+%else
     pmuludq     xmm1,   [mmx_01bytes]
+%endif
 
     unpcklps    xmm0,   xmm1
 
@@ -204,11 +245,21 @@
     lea         r1, [r1+r2*2]
     movzx       r3, byte [r1-1]
     movd        xmm2,   r3d
+%ifdef X86_32_PICASM
+    pmuludq     xmm2,   [esp]           ; reuses the 0x01 bytes pushed before the xmm0 multiply
+%else
     pmuludq     xmm2,   [mmx_01bytes]
+%endif
 
     movzx       r3, byte [r1+r2-1]
     movd        xmm3,   r3d
+%ifdef X86_32_PICASM
+    pmuludq     xmm3,   [esp]           ; last use of the stack constant
+    mov         esp,    r0              ; release it: r0 was loaded with the original esp earlier
+    pop         r0
+%else
     pmuludq     xmm3,   [mmx_01bytes]
+%endif
 
     unpcklps    xmm2,   xmm3
     unpcklpd    xmm0,   xmm2
@@ -233,11 +284,34 @@
     ;for H
     pxor    xmm7,   xmm7
     movq    xmm0,   [r1]
+%ifdef X86_32_PICASM
+    push    r5                          ; preserve r5; it keeps the original esp until restored further down
+    mov     r5, esp
+    and     esp, 0xfffffff0             ; 16-byte alignment required by the movdqa loads
+    push    0x00010002    ;sse2_plane_dec        -> ends up at [esp+32]
+    push    0x00030004
+    push    0x00050006
+    push    0x00070008
+    push    0x00080007    ;sse2_plane_inc        -> ends up at [esp+16]
+    push    0x00060005
+    push    0x00040003
+    push    0x00020001
+    push    0x0000ffff    ;sse2_plane_inc_minus  -> ends up at [esp]
+    push    0xfffefffd
+    push    0xfffcfffb
+    push    0xfffafff9
+    movdqa  xmm5,   [esp+32]            ; stack copy replacing [sse2_plane_dec]
+%else
     movdqa  xmm5,   [sse2_plane_dec]
+%endif
     punpcklbw xmm0, xmm7
     pmullw  xmm0,   xmm5
     movq    xmm1,   [r1 + 9]
+%ifdef X86_32_PICASM
+    movdqa  xmm6,   [esp+16]            ; second 16-byte group pushed above, replaces [sse2_plane_inc]
+%else
     movdqa  xmm6,   [sse2_plane_inc]
+%endif
     punpcklbw xmm1, xmm7
     pmullw  xmm1,   xmm6
     psubw   xmm1,   xmm0
@@ -283,7 +357,13 @@
     SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
+%ifdef X86_32_PICASM
+    movdqa  xmm5,   [esp]               ; last 16-byte group pushed earlier, replaces [sse2_plane_inc_minus]
+    mov     esp,    r5                  ; release the stack constants
+    pop     r5
+%else
     movdqa  xmm5,   [sse2_plane_inc_minus]
+%endif
 
 get_i16x16_luma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -321,11 +401,30 @@
 
     pxor    mm7,    mm7
     movq    mm0,    [r1]
+%ifdef X86_32_PICASM
+    push    r5                          ; preserve r5; it keeps the original esp until restored further down
+    mov     r5, esp
+    and     esp, 0xfffffff0             ; keeps [esp] 16-byte aligned after the 32 bytes pushed below
+    push    0x00010002    ;sse2_plane_dec_c      -> 8 bytes at [esp+24]
+    push    0x00030004
+    push    0x00040003    ;sse2_plane_inc_c      -> 8 bytes at [esp+16]
+    push    0x00020001
+    push    0x00040003    ;sse2_plane_mul_b_c    -> 16 bytes at [esp]
+    push    0x00020001
+    push    0x0000ffff
+    push    0xfffefffd
+    movq    mm5,    [esp+24]            ; stack copy replacing [sse2_plane_dec_c]
+%else
     movq    mm5,    [sse2_plane_dec_c]
+%endif
     punpcklbw mm0,  mm7
     pmullw  mm0,    mm5
     movq    mm1,    [r1 + 5]
+%ifdef X86_32_PICASM
+    movq    mm6,    [esp+16]            ; second qword pushed above, replaces [sse2_plane_inc_c]
+%else
     movq    mm6,    [sse2_plane_inc_c]
+%endif
     punpcklbw mm1,  mm7
     pmullw  mm1,    mm6
     psubw   mm1,    mm0
@@ -375,7 +474,13 @@
     SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s
 
     xor     r3, r3
+%ifdef X86_32_PICASM
+    movdqa  xmm5,   [esp]               ; lowest 16 bytes pushed earlier, replaces [sse2_plane_mul_b_c]
+    mov     esp,    r5                  ; release the stack constants
+    pop     r5
+%else
     movdqa  xmm5,   [sse2_plane_mul_b_c]
+%endif
 
 get_i_chroma_pred_plane_sse2_1:
     movdqa  xmm2,   xmm1
@@ -434,7 +539,18 @@
     movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
     pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
     pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                  ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0     ; round esp down to a 16-byte boundary
+    push        0x01010101          ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm1,[esp]   ;set the odd bit
+    mov         esp, r0             ; drop the temporary constant
+    pop         r0
+%else
     pand        mm1,[mmx_01bytes]   ;set the odd bit
+%endif
     psubusb     mm3,mm1             ;decrease 1 from odd bytes
     pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
@@ -503,7 +619,20 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      %1,     [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmullw      %1,     [mmx_01bytes]
+%endif
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -513,7 +642,20 @@
     psrlq       %1,     38h
 
     ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      %1,     [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmullw      %1,     [mmx_01bytes]
+%endif
     pshufw      %1,     %1, 0
     movq        [%4],   %1
 %endmacro
@@ -526,7 +668,20 @@
     psrlq       mm0,    38h
 
     ;pmuludq        mm0,    [mmx_01bytes]       ;extend to 4 bytes
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 16 bytes pushed; the MMX multiply reads only the low 8
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmullw      mm0,        [esp]       ; stack copy replacing [mmx_01bytes]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0                      ; r0 is valid again for the store that follows
+%else
     pmullw      mm0,        [mmx_01bytes]
+%endif
     pshufw      mm0,    mm0,    0
     movq        [r0],   mm0
 
@@ -636,7 +791,18 @@
     pavgb       mm1, mm0
 
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm4, [esp]      ; set the odd bit
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pand        mm4, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
@@ -715,7 +881,18 @@
     pavgb       mm2, mm0
 
     pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm5, [esp]      ; set the odd bit
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pand        mm5, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
@@ -794,7 +971,18 @@
     pavgb       mm2, mm0
 
     pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm3, [esp]      ; set the odd bit
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pand        mm3, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
     movq        mm3, mm0
@@ -872,7 +1060,18 @@
     movq        mm3, mm1
     pavgb       mm1, mm2
     pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm3, [esp]      ; set the odd bit
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pand        mm3, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
     pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
@@ -936,7 +1135,18 @@
     movq        mm4, mm2
     pavgb       mm2, mm0
     pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 8 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    pand        mm4, [esp]      ; set the odd bit
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pand        mm4, [mmx_01bytes]      ; set the odd bit
+%endif
     psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
     pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
@@ -998,7 +1208,18 @@
     movq        mm1, mm2
     paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x00000000              ; high dword of the qword read below
+    push        0x00000002              ; low dword: qword at [esp] == 2, replaces [mmx_0x02]
+    movq        mm4, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     movq        mm4, [mmx_0x02]
+%endif
 
     paddq       mm0, mm4
     psrlq       mm0, 0x02
@@ -1014,13 +1235,32 @@
     paddq       mm1, mm4
     psrlq       mm1, 0x03
 
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it keeps the original esp until restored below
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; round esp down to a 16-byte boundary
+    push        0x01010101              ; 16 bytes pushed; the MMX multiplies read only the low 8
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     mm0, [esp]              ; stack copy replacing [mmx_01bytes]
+    pmuludq     mm3, [esp]              ; esp stays lowered: the constant is reused a few lines down
+%else
     pmuludq     mm0, [mmx_01bytes]
     pmuludq     mm3, [mmx_01bytes]
+%endif
     psllq       mm0, 0x20
     pxor        mm0, mm3                 ; mm0 = m_up
 
+%ifdef X86_32_PICASM
+    pmuludq     mm2, [esp]              ; [esp] still holds the 0x01 bytes pushed above
+    pmuludq     mm1, [esp]
+    mov         esp, r0                 ; release the stack constant
+    pop         r0
+%else
     pmuludq     mm2, [mmx_01bytes]
     pmuludq     mm1, [mmx_01bytes]
+%endif
     psllq       mm1, 0x20
     pxor        mm1, mm2                 ; mm2 = m_down
 
@@ -1076,7 +1316,20 @@
     movd        xmm1, r3d
     paddw       xmm0, xmm1
     psrld       xmm0, 0x05
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment, needed by the xmm memory operand
+    push        0x01010101              ; 16 bytes of 0x01 on the stack replace [mmx_01bytes]
+    push        0x01010101
+    push        0x01010101
+    push        0x01010101
+    pmuludq     xmm0, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0                      ; r0 is valid again for the stores that follow
+%else
     pmuludq     xmm0, [mmx_01bytes]
+%endif
     pshufd      xmm0, xmm0, 0
 
     movdqa      [r0], xmm0
@@ -1098,4 +1351,4 @@
 
     pop r4
     pop r3
-    ret
\ No newline at end of file
+    ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -696,9 +696,26 @@
     mov     ebx,    [height]
     mov     [i_height], ebx
 
+%ifdef X86_32_PICASM
+    push    r0                          ; preserve r0; it is reused to remember esp
+    mov     r0, esp
+    and     esp, 0xfffffff0             ; round esp down to a 16-byte boundary
+    push    0x00100010                  ;mv_x_inc_x4    -> 8 bytes at [esp+16]
+    push    0x00100010
+    push    0x00040004                  ;mv_y_inc_x4    -> 8 bytes at [esp+8]
+    push    0x00040004
+    push    0x000c0008                  ;mx_x_offset_x4 -> 8 bytes at [esp]
+    push    0x00040000
+    movq    xmm7,   [esp+16]            ; x_qpel inc
+    movq    xmm6,   [esp+8]             ; y_qpel inc
+    movq    xmm5,   [esp]               ; x_qpel vector
+    mov     esp,    r0                  ; constants are in registers now; release the stack copy
+    pop     r0
+%else
     movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
     movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
     movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+%endif
     pxor    xmm4,   xmm4
     pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
@@ -1398,9 +1415,26 @@
     push r13
     mov     r12,    r2
 
+%ifdef X86_32_PICASM
+    push    r0                          ; preserve r0; it is reused to remember esp
+    mov     r0, esp
+    and     esp, 0xfffffff0             ; round esp down to a 16-byte boundary
+    push    0x00100010                  ;mv_x_inc_x4    -> 8 bytes at [esp+16]
+    push    0x00100010
+    push    0x00040004                  ;mv_y_inc_x4    -> 8 bytes at [esp+8]
+    push    0x00040004
+    push    0x000c0008                  ;mx_x_offset_x4 -> 8 bytes at [esp]
+    push    0x00040000
+    movq    xmm7,   [esp+16]            ; x_qpel inc
+    movq    xmm6,   [esp+8]             ; y_qpel inc
+    movq    xmm5,   [esp]               ; x_qpel vector
+    mov     esp,    r0                  ; restore esp (this branch previously left it lowered and r0 clobbered)
+    pop     r0
+%else
     movq    xmm7,   [mv_x_inc_x4]       ; x_qpel inc
     movq    xmm6,   [mv_y_inc_x4]       ; y_qpel inc
     movq    xmm5,   [mx_x_offset_x4]    ; x_qpel vector
+%endif
     pxor    xmm4,   xmm4
     pxor    xmm3,   xmm3                ; y_qpel vector
 HASH_HEIGHT_LOOP_SSE2:
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -207,8 +207,26 @@
     pextrw      r1d,  xmm1, 0           ; eax = [8]
     pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
     pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment for the pshufb memory operands
+    push        0x0d0c0706              ;pb_scanacdc_maska -> 16 bytes at [esp+16]
+    push        0x05040b0a
+    push        0x0f0e0908
+    push        0x03020100
+    push        0x0f0e0d0c              ;pb_scanacdc_maskb -> 16 bytes at [esp]
+    push        0x07060100
+    push        0x05040b0a
+    push        0x09080302
+    pshufb      xmm1, [esp]             ; stack copy replacing [pb_scanacdc_maskb]
+    pshufb      xmm0, [esp+16]          ; stack copy replacing [pb_scanacdc_maska]
+    mov         esp, r0                 ; drop the temporary masks
+    pop         r0                      ; r0 is valid again for the stores that follow
+%else
     pshufb      xmm1, [pb_scanacdc_maskb]
     pshufb      xmm0, [pb_scanacdc_maska]
+%endif
 
     movdqa     [r0],xmm0
     movdqa     [r0+16], xmm1
@@ -250,6 +268,7 @@
     ret
 
 
+%ifndef X86_32_PICASM
 ;***********************************************************************
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
@@ -306,8 +325,10 @@
     mov retrd, r0d
     %endif
     ret
+%endif ;ifndef X86_32_PICASM
 
 
+%ifndef X86_32_PICASM
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
@@ -336,6 +357,7 @@
     add   retrq, r1
     ;add       al,  [nozero_count_table+r1]
     ret
+%endif ;%ifndef X86_32_PICASM
 
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -147,7 +147,20 @@
     movdqa      %2, %1
     psrldq      %2, 2
     punpcklbw   %2, %4
+%ifdef X86_32_PICASM
+    push        r0                      ; preserve r0; it is reused to remember esp
+    mov         r0, esp
+    and         esp, 0xfffffff0         ; 16-byte alignment for the memory operand below
+    push        0x00140014              ; eight words of 0x0014 (20) on the stack replace [sse2_20]
+    push        0x00140014
+    push        0x00140014
+    push        0x00140014
+    pmullw      %2, [esp]
+    mov         esp, r0                 ; drop the temporary constant
+    pop         r0
+%else
     pmullw      %2, [sse2_20]
+%endif
     paddw       %3, %2
 
     movdqa      %2, %1
@@ -184,7 +197,13 @@
 
     movq        xmm6,   [r0]
     punpcklbw   xmm6,   xmm7
+%ifdef X86_32_PICASM
+    pcmpeqw     xmm3,   xmm3        ; every word = 0xffff
+    psrlw       xmm3,   15          ; every word = 1
+    psllw       xmm3,   5           ; every word = 32, built in-register instead of loading [sse2_32]
+%else
     movdqa      xmm3,   [sse2_32]
+%endif
     pxor        xmm4,   xmm4        ; nTotWeight
     pxor        xmm5,   xmm5        ; nSum
 
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -1253,7 +1253,20 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
+%ifdef X86_32_PICASM
+    push    r0                      ; preserve r0; it is reused to remember esp
+    mov     r0, esp
+    and     esp, 0xffffffe0         ; 32-byte boundary; still 16-byte aligned after the 16 bytes pushed below
+    push    0x00000000
+    push    0x00000000
+    push    0x00000000
+    push    0x00004000              ; lowest dword = 0x4000, the three above it are zero
+    movdqa  xmm1,   [esp]           ; stack copy replacing [add_extra_half]
+    mov     esp, r0                 ; drop the temporary constant
+    pop     r0
+%else
     movdqa  xmm1,   [add_extra_half]
+%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1554,7 +1567,20 @@
     pmaddwd     xmm2,   xmm1
     pshufd  xmm1,   xmm2,   00000001b
     paddd   xmm2,   xmm1
+%ifdef X86_32_PICASM
+    push    r0                      ; preserve r0; it is reused to remember esp
+    mov     r0, esp
+    and     esp, 0xffffffe0         ; 32-byte boundary; still 16-byte aligned after the 16 bytes pushed below
+    push    0x00000000
+    push    0x00000000
+    push    0x00000000
+    push    0x00004000              ; lowest dword = 0x4000, the three above it are zero
+    movdqa  xmm1,   [esp]           ; stack copy replacing [add_extra_half]
+    mov     esp, r0                 ; drop the temporary constant
+    pop     r0
+%else
     movdqa  xmm1,   [add_extra_half]
+%endif
     paddd   xmm2,   xmm1
     psrld   xmm2,   15
 
@@ -1671,15 +1697,52 @@
     ;1st line
     movdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
+%ifdef X86_32_PICASM
+    push   r0                   ; preserve r0; it keeps the original esp until restored after the 2nd line
+    mov    r0, esp
+    and    esp, 0xfffffff0      ; 16-byte alignment required by the movdqa loads
+    push   0x80808080    ;shufb_mask_onethird_low_1   -> [esp+80]
+    push   0x80808080
+    push   0x80800f0c
+    push   0x09060300
+    push   0x80808080    ;shufb_mask_onethird_high_1  -> [esp+64]
+    push   0x80808080
+    push   0x8080800d
+    push   0x0a070401
+    push   0x80808080    ;shufb_mask_onethird_low_2   -> [esp+48]
+    push   0x800e0b08
+    push   0x05028080
+    push   0x80808080
+    push   0x80808080    ;shufb_mask_onethird_high_2  -> [esp+32]
+    push   0x800f0c09
+    push   0x06030080
+    push   0x80808080
+    push   0x0d0a0704    ;shufb_mask_onethird_low_3   -> [esp+16]
+    push   0x01808080
+    push   0x80808080
+    push   0x80808080
+    push   0x0e0b0805    ;shufb_mask_onethird_high_3  -> [esp]
+    push   0x02808080
+    push   0x80808080
+    push   0x80808080
+    movdqa xmm5, [esp+80]       ; stack copy of shufb_mask_onethird_low_1
+    movdqa xmm6, [esp+64]       ; stack copy of shufb_mask_onethird_high_1
+%else
     movdqa xmm5, [shufb_mask_onethird_low_1]
     movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+48]       ; stack copy of shufb_mask_onethird_low_2
+    movdqa xmm6, [esp+32]       ; stack copy of shufb_mask_onethird_high_2
+%else
     movdqa xmm5, [shufb_mask_onethird_low_2]
     movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1688,8 +1751,13 @@
 
     movdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+16]       ; stack copy of shufb_mask_onethird_low_3
+    movdqa xmm6, [esp]          ; stack copy of shufb_mask_onethird_high_3
+%else
     movdqa xmm5, [shufb_mask_onethird_low_3]
     movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1700,15 +1768,25 @@
     ;2nd line
     movdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+80]       ; stack copy of shufb_mask_onethird_low_1
+    movdqa xmm6, [esp+64]       ; stack copy of shufb_mask_onethird_high_1
+%else
     movdqa xmm5, [shufb_mask_onethird_low_1]
     movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+48]       ; stack copy of shufb_mask_onethird_low_2
+    movdqa xmm6, [esp+32]       ; stack copy of shufb_mask_onethird_high_2
+%else
     movdqa xmm5, [shufb_mask_onethird_low_2]
     movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -1717,8 +1795,15 @@
 
     movdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+16]       ; stack copy of shufb_mask_onethird_low_3
+    movdqa xmm6, [esp]          ; stack copy of shufb_mask_onethird_high_3
+    mov    esp, r0              ; all six masks consumed for both lines; release the 96 bytes
+    pop    r0
+%else
     movdqa xmm5, [shufb_mask_onethird_low_3]
     movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -1821,15 +1906,52 @@
     ;1st line
     movntdqa xmm0, [r2]                         ;F * e E * d D * c C * b B * a A
     movdqa xmm1, xmm0
+%ifdef X86_32_PICASM
+    push   r0                   ; preserve r0; it keeps the original esp until restored after the 2nd line
+    mov    r0, esp
+    and    esp, 0xfffffff0      ; 16-byte alignment required by the movdqa loads
+    push   0x80808080    ;shufb_mask_onethird_low_1   -> [esp+80]
+    push   0x80808080
+    push   0x80800f0c
+    push   0x09060300
+    push   0x80808080    ;shufb_mask_onethird_high_1  -> [esp+64]
+    push   0x80808080
+    push   0x8080800d
+    push   0x0a070401
+    push   0x80808080    ;shufb_mask_onethird_low_2   -> [esp+48]
+    push   0x800e0b08
+    push   0x05028080
+    push   0x80808080
+    push   0x80808080    ;shufb_mask_onethird_high_2  -> [esp+32]
+    push   0x800f0c09
+    push   0x06030080
+    push   0x80808080
+    push   0x0d0a0704    ;shufb_mask_onethird_low_3   -> [esp+16]
+    push   0x01808080
+    push   0x80808080
+    push   0x80808080
+    push   0x0e0b0805    ;shufb_mask_onethird_high_3  -> [esp]
+    push   0x02808080
+    push   0x80808080
+    push   0x80808080
+    movdqa xmm5, [esp+80]       ; stack copy of shufb_mask_onethird_low_1
+    movdqa xmm6, [esp+64]       ; stack copy of shufb_mask_onethird_high_1
+%else
     movdqa xmm5, [shufb_mask_onethird_low_1]
     movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
     pshufb xmm0, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
     pshufb xmm1, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1
 
     movntdqa xmm2, [r2+16]                      ;k K * j J * i I * h H * g G * f
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+48]       ; stack copy of shufb_mask_onethird_low_2
+    movdqa xmm6, [esp+32]       ; stack copy of shufb_mask_onethird_high_2
+%else
     movdqa xmm5, [shufb_mask_onethird_low_2]
     movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
     pshufb xmm2, xmm5                           ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3
 
@@ -1838,8 +1960,13 @@
 
     movntdqa xmm2, [r2+32]                      ;* p P * o O * n N * m M * l L *
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+16]       ; stack copy of shufb_mask_onethird_low_3
+    movdqa xmm6, [esp]          ; stack copy of shufb_mask_onethird_high_3
+%else
     movdqa xmm5, [shufb_mask_onethird_low_3]
     movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
     pshufb xmm2, xmm5                           ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
     pshufb xmm3, xmm6                           ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3
 
@@ -1850,15 +1977,25 @@
     ;2nd line
     movntdqa xmm2, [r2+r3]                      ;F' *  e' E' *  d' D' *  c' C' *  b' B' *  a' A'
     movdqa xmm3, xmm2
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+80]       ; stack copy of shufb_mask_onethird_low_1
+    movdqa xmm6, [esp+64]       ; stack copy of shufb_mask_onethird_high_1
+%else
     movdqa xmm5, [shufb_mask_onethird_low_1]
     movdqa xmm6, [shufb_mask_onethird_high_1]
+%endif
     pshufb xmm2, xmm5                           ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
     pshufb xmm3, xmm6                           ;0 0 0 0 0 0 0 0 0 0 0  e' d' c' b' a' -> xmm3
 
     movntdqa xmm1, [r2+r3+16]                   ;k' K' *  j' J' *  i' I' *  h' H' *  g' G' *  f'
     movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+48]       ; stack copy of shufb_mask_onethird_low_2
+    movdqa xmm6, [esp+32]       ; stack copy of shufb_mask_onethird_high_2
+%else
     movdqa xmm5, [shufb_mask_onethird_low_2]
     movdqa xmm6, [shufb_mask_onethird_high_2]
+%endif
     pshufb xmm1, xmm5                           ;0 0 0 0 0 K' J' I' H' G' 0  0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4
 
@@ -1867,8 +2004,15 @@
 
     movntdqa xmm1, [r2+r3+32]                   ; *  p' P' *  o' O' *  n' N' *  m' M' *  l' L' *
     movdqa xmm4, xmm1
+%ifdef X86_32_PICASM
+    movdqa xmm5, [esp+16]       ; stack copy of shufb_mask_onethird_low_3
+    movdqa xmm6, [esp]          ; stack copy of shufb_mask_onethird_high_3
+    mov    esp, r0              ; all six masks consumed for both lines; release the 96 bytes
+    pop    r0
+%else
     movdqa xmm5, [shufb_mask_onethird_low_3]
     movdqa xmm6, [shufb_mask_onethird_high_3]
+%endif
     pshufb xmm1, xmm5                           ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
     pshufb xmm4, xmm6                           ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4
 
@@ -2112,7 +2256,20 @@
     add r6, r0
     movq xmm7, [r6]
 
+%ifdef X86_32_PICASM
+    push   r0                   ; preserve r0; it is reused to remember esp
+    mov    r0, esp
+    and    esp, 0xfffffff0      ; 16-byte alignment required by movdqa
+    push   0x80808080           ; bytes 12..15 of the mask
+    push   0x0d090501           ; bytes  8..11
+    push   0x80808080           ; bytes  4..7
+    push   0x0c080400           ; bytes  0..3
+    movdqa xmm6, [esp]          ; stack copy replacing [shufb_mask_quarter]
+    mov    esp, r0              ; drop the temporary mask
+    pop    r0
+%else
     movdqa xmm6, [shufb_mask_quarter]
+%endif
 .yloops_quarter_sse3:
     ;mov eax, [esp+40]   ; iSrcWidth
     ;sar eax, $02            ; iSrcWidth >> 2
@@ -2221,7 +2378,20 @@
     add r6, r0
     movq xmm7, [r6]
 
+%ifdef X86_32_PICASM
+    push   r0                   ; preserve r0; it is reused to remember esp
+    mov    r0, esp
+    and    esp, 0xfffffff0      ; 16-byte alignment required by movdqa
+    push   0x80808080           ; bytes 12..15 of the mask
+    push   0x0d090501           ; bytes  8..11
+    push   0x80808080           ; bytes  4..7
+    push   0x0c080400           ; bytes  0..3
+    movdqa xmm6, [esp]          ; stack copy replacing [shufb_mask_quarter]
+    mov    esp, r0              ; drop the temporary mask
+    pop    r0
+%else
     movdqa xmm6, [shufb_mask_quarter]    ;mask
+%endif
 
 .yloops_quarter_sse4:
 %ifdef X86_32
@@ -2364,7 +2534,20 @@
 
 %macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r0                      ; preserve r0; it is reused to remember esp
+    mov             r0, esp
+    and             esp, 0xfffffff0         ; 16-byte alignment for the pshufb memory operand
+    push            0x08080808              ; upper 8 bytes of the mask: index 8
+    push            0x08080808
+    push            0x00000000              ; lower 8 bytes of the mask: index 0
+    push            0x00000000
+    pshufb          xmm_tmp0, [esp]         ; stack copy replacing [shufb_0000000088888888]
+    mov             esp, r0                 ; drop the temporary mask
+    pop             r0
+%else
     pshufb          xmm_tmp0, [shufb_0000000088888888]
+%endif
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
     mov             r_tmp0, i_xpos
@@ -2372,7 +2555,24 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    punpcklbw       xmm_tmp2, [esp]     ; interleave low 8 bytes of xmm_tmp2 with 0x80; reads only the low 16 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     punpcklbw       xmm_tmp2, [db80h_256]
+%endif
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2385,7 +2585,24 @@
     lddqu           xmm_tmp3, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp4, [p_src_row1 + r_tmp0]
     movdqa          xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    punpckhbw       xmm_tmp2, [esp]     ; interleave high 8 bytes of xmm_tmp2 with 0x80; reads only the low 16 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     punpckhbw       xmm_tmp2, [db80h_256]
+%endif
     pshufb          xmm_tmp3, xmm_tmp2
     pshufb          xmm_tmp4, xmm_tmp2
     SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
@@ -2524,13 +2741,43 @@
 
 %macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
     movdqa          xmm_tmp0, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r0                  ; PIC build: the pshufb control is built on the stack, not read from a data label
+    mov             r0, esp             ; r0 = esp before alignment, used to restore it below
+    and             esp, 0xfffffff0     ; legacy-SSE memory operand must be 16-byte aligned
+    push            0x08080808          ; control bytes 12..15: select source byte 8
+    push            0x08080808          ; control bytes  8..11: select source byte 8
+    push            0x00000000          ; control bytes  4..7 : select source byte 0
+    push            0x00000000          ; control bytes  0..3 : select source byte 0 (lowest address)
+    pshufb          xmm_tmp0, [esp]     ; same operation as the %else branch, with the stack copy as control
+    mov             esp, r0             ; discard the temporary and undo the alignment
+    pop             r0                  ; restore the borrowed register
+%else
     pshufb          xmm_tmp0, [shufb_0000000088888888]
+%endif
     psubb           xmm_xpos_int, xmm_tmp0
     SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
     mov             r_tmp0, i_xpos
     shr             r_tmp0, 16
     movdqa          xmm_tmp3, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    punpcklbw       xmm_tmp3, [esp]     ; interleave low 8 bytes of xmm_tmp3 with 0x80; reads only the low 16 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     punpcklbw       xmm_tmp3, [db80h_256]
+%endif
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp2, [p_src_row1 + r_tmp0]
     lea             r_tmp0, [i_xpos + 4 * i_scalex]
@@ -2542,7 +2789,24 @@
     pmaddwd         xmm_tmp2, xmm_tmp0
     SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
     movdqa          xmm_tmp2, xmm_xpos_int
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    punpckhbw       xmm_tmp2, [esp]     ; interleave high 8 bytes of xmm_tmp2 with 0x80; reads only the low 16 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     punpckhbw       xmm_tmp2, [db80h_256]
+%endif
     lddqu           xmm_tmp4, [p_src_row0 + r_tmp0]
     lddqu           xmm_tmp3, [p_src_row1 + r_tmp0]
     pshufb          xmm_tmp4, xmm_tmp2
@@ -3321,7 +3585,20 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample4xOrLess_16px 0
+%ifdef X86_32_PICASM
+    push            r0                  ; PIC build: the shuffle control is built on the stack, not read from a data label
+    mov             r0, esp             ; r0 = esp before alignment, used to restore it below
+    and             esp, 0xfffffff0     ; round esp down to a 16-byte boundary
+    push            0x08080808          ; control bytes 12..15: 08 08 08 08
+    push            0x08080808          ; control bytes  8..11: 08 08 08 08
+    push            0x00000000          ; control bytes  4..7 : 00 00 00 00
+    push            0x00000000          ; control bytes  0..3 : 00 00 00 00 (lowest address)
+    vbroadcasti128  ymm_tmp0, [esp]     ; copy the 16-byte control into both 128-bit lanes, as the %else branch does from the label
+    mov             esp, r0             ; discard the temporary and undo the alignment
+    pop             r0                  ; restore the borrowed register
+%else
     vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
+%endif
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff
@@ -3365,7 +3642,20 @@
 %endmacro
 
 %macro AVX2_BilinearFastDownsample8xOrLess_16px 0
+%ifdef X86_32_PICASM
+    push            r0                  ; PIC build: the shuffle control is built on the stack, not read from a data label
+    mov             r0, esp             ; r0 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; 32-byte alignment; NOTE(review): only 16 bytes are pushed, so 0xfffffff0 would also do
+    push            0x0c0c0c0c          ; control bytes 12..15: 0c 0c 0c 0c
+    push            0x08080808          ; control bytes  8..11: 08 08 08 08
+    push            0x04040404          ; control bytes  4..7 : 04 04 04 04
+    push            0x00000000          ; control bytes  0..3 : 00 00 00 00 (lowest address)
+    vbroadcasti128  ymm_tmp0, [esp]     ; copy the 16-byte control into both 128-bit lanes, as the %else branch does from the label
+    mov             esp, r0             ; discard the temporary and undo the alignment
+    pop             r0                  ; restore the borrowed register
+%else
     vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
+%endif
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -3604,7 +3894,20 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the shuffle control is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; 32-byte alignment; NOTE(review): only 16 bytes are pushed, so 0xfffffff0 would also do
+    push            0x08080808    ; stand-in for shufb_0000000088888888: control bytes 12..15
+    push            0x08080808          ; control bytes  8..11: 08 08 08 08
+    push            0x00000000          ; control bytes  4..7 : 00 00 00 00
+    push            0x00000000          ; control bytes  0..3 : 00 00 00 00 (lowest address)
+    vbroadcasti128  ymm_tmp0, [esp]     ; copy the 16-byte control into both 128-bit lanes, as the %else branch does from the label
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     vbroadcasti128  ymm_tmp0, [shufb_0000000088888888]
+%endif
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
@@ -3619,7 +3922,24 @@
     lea             r_tmp0, [i_xpos + 2 * i_scalex2]
     lea             i_xpos, [r_tmp0 + 4 * i_scalex2]
     shr             r_tmp0, 16
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]   ; same operation as the %else branch; the ymm form reads all 32 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp0
@@ -3632,7 +3952,24 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    vpunpckhbw      ymm_tmp3, ymm_xpos_int, [esp]   ; same operation as the %else branch; the ymm form reads all 32 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     vpunpckhbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp2, ymm_tmp2, ymm_tmp3
     vpmaddwd        ymm_tmp4, ymm_tmp4, ymm_tmp1
@@ -3648,7 +3985,20 @@
 %endmacro
 
 %macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the shuffle control is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; 32-byte alignment; NOTE(review): only 16 bytes are pushed, so 0xfffffff0 would also do
+    push            0x0c0c0c0c    ; stand-in for shufb_000044448888CCCC: control bytes 12..15
+    push            0x08080808          ; control bytes  8..11: 08 08 08 08
+    push            0x04040404          ; control bytes  4..7 : 04 04 04 04
+    push            0x00000000          ; control bytes  0..3 : 00 00 00 00 (lowest address)
+    vbroadcasti128  ymm_tmp0, [esp]     ; copy the 16-byte control into both 128-bit lanes, as the %else branch does from the label
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     vbroadcasti128  ymm_tmp0, [shufb_000044448888CCCC]
+%endif
     vpshufb         ymm_tmp0, ymm_xpos_int, ymm_tmp0
     vpsubb          ymm_xpos_int, ymm_xpos_int, ymm_tmp0
     mov             r_tmp0, i_xpos
@@ -3669,7 +4019,24 @@
     shr             r_tmp0, 16
     vinserti128     ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
     vinserti128     ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
+%ifdef X86_32_PICASM
+    push            r5                  ; PIC build: the 0x80 fill pattern is built on the stack, not read from a data label
+    mov             r5, esp             ; r5 = esp before alignment, used to restore it below
+    and             esp, 0xffffffe0     ; round esp down to a 32-byte boundary
+    push            0x80808080    ; stand-in for db80h_256: 8 dwords = 32 bytes, every byte 0x80
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    push            0x80808080
+    vpunpcklbw      ymm_tmp3, ymm_xpos_int, [esp]   ; same operation as the %else branch; the ymm form reads all 32 bytes pushed
+    mov             esp, r5             ; discard the temporary and undo the alignment
+    pop             r5                  ; restore the borrowed register
+%else
     vpunpcklbw      ymm_tmp3, ymm_xpos_int, [db80h_256]
+%endif
     vpshufb         ymm_tmp4, ymm_tmp4, ymm_tmp3
     vpshufb         ymm_tmp5, ymm_tmp5, ymm_tmp3
     vpshufb         ymm_tmp0, ymm_tmp0, ymm_tmp3
--- a/test/encoder/EncUT_Cavlc.cpp
+++ b/test/encoder/EncUT_Cavlc.cpp
@@ -77,14 +77,18 @@
 }
 
 #ifdef X86_32_ASM
+#ifndef X86_32_PICASM // test is compiled out of X86_32_PICASM builds
 TEST (CavlcTest, CavlcParamCal_sse2) {
   TestCavlcParamCal (CavlcParamCal_sse2);
 }
 #endif
+#endif // X86_32_ASM (the pre-existing #endif above now closes the X86_32_PICASM guard)
 
 #ifdef X86_ASM
+#ifndef X86_32_PICASM // test is compiled out of X86_32_PICASM builds
 TEST (CavlcTest, CavlcParamCal_sse42) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
     TestCavlcParamCal (CavlcParamCal_sse42);
 }
+#endif // X86_32_PICASM
 #endif
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,6 +222,7 @@
 }
 #endif //HAVE_AVX2
 
+#ifndef X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
   CMemoryAlign cMemoryAlign (0);
   ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -235,6 +236,7 @@
   FREE_MEMORY (iDctC);
   FREE_MEMORY (iDctS);
 }
+#endif //#ifndef X86_32_PICASM
 #endif
 
 void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -302,9 +304,11 @@
   TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
 }
 #ifdef X86_ASM
+#ifndef X86_32_PICASM // test is compiled out of X86_32_PICASM builds
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
   TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
 }
+#endif // X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
     TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);