shithub: dav1d

ref: d21dc801529a4aeeaad0d7da4bd1f8e675cba269
parent: 81a264586522e44a145152ce27bc4cbe4152dad4
author: Henrik Gramner <[email protected]>
date: Mon Feb 3 18:56:06 EST 2020

x86: Add miscellaneous minor scalar optimizations

Shave off a few instructions, or save a few bytes, in various places.
Also change some instructions to use appropriately sized registers.
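
For readers counting the bytes, here is a minimal sketch (not part of the patch) of the x86-64 encodings involved. Sizes are shown for a legacy register such as edx/rdx; extended registers (r8d-r15d) keep a REX prefix either way, so the width changes there are about using the natural operand size rather than saving space. mova/m0 below are x86inc aliases, kept for familiarity:

    mov   rdx, 4         ; 48 C7 C2 04 00 00 00  (7 bytes: REX.W + imm32)
    mov   edx, 4         ; BA 04 00 00 00        (5 bytes)
    dec   rdx            ; 48 FF CA              (3 bytes)
    dec   edx            ; FF CA                 (2 bytes)
    and   edx, 0xff      ; 81 E2 FF 00 00 00     (6 bytes: imm32)
    and   edx, 0x7f      ; 83 E2 7F              (3 bytes: sign-extended imm8)
    mova  [rax+64*2], m0 ; +128 needs a 4-byte displacement
    mova  [rax-64*2], m0 ; -128 fits in a 1-byte displacement

The last pair is why .zero_loop_half in itx.asm moves the pointer increment up: the trailing stores can then use short negative displacements. Likewise, the changes from "and mxd, 0xff" to "and mxd, 0x7f" in mc_ssse3.asm only work because the masked value evidently fits in 7 bits, which lets the assembler emit the short imm8 form.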

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -4630,9 +4630,9 @@
 .zero_loop_half:
     mova         [rax+64*0], m0
     mova         [rax+64*1], m0
-    mova         [rax+64*2], m0
-    mova         [rax+64*3], m0
     add                 rax, 64*4
+    mova         [rax-64*2], m0
+    mova         [rax-64*1], m0
     sub                 r0d, 2
     jg .zero_loop_half
     RET
@@ -4697,12 +4697,9 @@
     lea                dstq, [r5+16]
     jmp .loop
 .ret:
-    sub                  cq, 32
+    sub                  cd, eax
     pxor                 m0, m0
-    mov                 r0d, 4
-    mov                 r1d, 8
-    cmp                  cq, rax
-    cmova               r0d, r1d
+    add                  cd, 384
 .zero_loop:
     mova         [rax+32*0], m0
     mova         [rax+32*1], m0
@@ -4709,8 +4706,8 @@
     mova         [rax+32*2], m0
     mova         [rax+32*3], m0
     add                 rax, 32*4
-    dec                 r0d
-    jg .zero_loop
+    sub                  cd, 128
+    jge .zero_loop
     RET
 
 cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -2026,7 +2026,7 @@
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
 %ifidn %2, dct
-    movd                m2, [o(pw_16384)]
+    movd                 m2, [o(pw_16384)]
     mov            [coeffq], eobd
     mov                 r2d, 2
     lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
@@ -4483,7 +4483,7 @@
     add                   dstq, strideq
     dec                    r3d
     jg .loop
-    jmp                tx2q
+    jmp                   tx2q
 
 .end:
     RET
@@ -4551,7 +4551,7 @@
 
 .end3:
     mov                   dstq, r3
-    lea                     r3, [r3+8]
+    add                     r3, 8
     lea                   tx2q, [o(m(idct_32x8_internal).end4)]
     jmp   m(idct_8x8_internal).pass2_main
 
@@ -4564,7 +4564,7 @@
 
 .end5:
     mov                   dstq, r3
-    lea                     r3, [r3+8]
+    add                     r3, 8
     lea                   tx2q, [o(m(idct_32x8_internal).end6)]
     jmp   m(idct_8x8_internal).pass2_main
 
@@ -4721,7 +4721,7 @@
     mova   [rsp+gprsize+16*23], m5                        ;in5
     mova   [rsp+gprsize+16*22], m7                        ;in7
 
-    cmp                eobd, 150
+    cmp                   eobd, 150
     jg .full
 
     mova                    m1, m4                        ;in4
@@ -5036,24 +5036,20 @@
 cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r4, 1
-    mov                     r5, 2
+    mov                    r4d, eobd
     cmp                   eobd, 43                ;if (eob > 43)
-    cmovg                   r4, r5                ;  iteration_count++
-    inc                     r5
-    cmp                   eobd, 150               ;if (eob > 150)
-    cmovg                   r4, r5                ;  iteration_count++
-    inc                     r5
-    cmp                   eobd, 278               ;if (eob > 278)
-    cmovg                   r4, r5                ;  iteration_count++
+    sbb                    r3d, r3d               ;  iteration_count++
+    cmp                    r4d, 150               ;if (eob > 150)
+    sbb                    r3d, 0                 ;  iteration_count++
+    cmp                    r4d, 278               ;if (eob > 278)
+    sbb                    r3d, -4                ;  iteration_count++
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
-    lea                     r3, [dstq+8]
-    mov             [rsp+16*3], r3
-    mov                     r3, r4
-    mov     [rsp+gprsize+16*3], r4
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4
+    mov     [rsp+gprsize+16*3], r3d
     mov   [rsp+gprsize*2+16*3], coeffq
 
 .loop:
@@ -5089,15 +5085,15 @@
     call  m(idct_8x8_internal).end3
     lea                   dstq, [dstq+strideq*2]
     add                 coeffq, 16
-    dec                     r3
+    dec                    r3d
     jg .loop
     mov                 coeffq, [rsp+gprsize*2+16*3]
     add                 coeffq, 64*8
-    mov                     r3, [rsp+gprsize+16*3]
+    mov                    r3d, [rsp+gprsize+16*3]
     xor                   dstq, dstq
     mov     [rsp+gprsize+16*3], dstq
     mov                   dstq, [rsp+16*3]
-    test                    r3, r3
+    test                   r3d, r3d
     jnz .loop
     RET
 
@@ -5105,20 +5101,19 @@
 cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r4, 12                ;0100b
-    mov                     r5, 136               ;1000 1000b
-    cmp                   eobd, 43                ;if (eob > 43)
-    cmovg                   r4, r5                ;  iteration_count+2
-    mov                     r5, 34952             ;1000 1000 1000 1000b
-    cmp                   eobd, 150               ;if (eob > 150)
-    cmovg                   r4, r5                ;  iteration_count += 4
+    mov                    r4d, 12                ;0100b
+    mov                    r5d, 136               ;1000 1000b
+    cmp                   eobd, 44                ;if (eob > 43)
+    cmovns                 r4d, r5d               ;  iteration_count+2
+    cmp                   eobd, 151               ;if (eob > 150)
+    mov                    r3d, 34952             ;1000 1000 1000 1000b
+    cmovs                  r3d, r4d               ;  iteration_count += 4
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
-    lea                     r3, [dstq+8]
-    mov             [rsp+16*3], r3
-    mov                     r3, r4
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4
 
 .loop:
     LOAD_8ROWS          coeffq, 32, 1
@@ -5147,16 +5142,13 @@
 
 .loop_end:
     add                 coeffq, 16
-    shr                     r3, 2
-    test                    r3, r3
+    shr                    r3d, 2
     jz .ret
-    test                    r3, 2
+    test                   r3d, 2
     jnz .loop
-    mov                     r4, r3
-    and                     r4, 1
-    shl                     r4, 3
-    add                 coeffq, r4
-    add                 coeffq, 32*7
+    mov                    r4d, r3d
+    and                    r4d, 1
+    lea                 coeffq, [coeffq+r4*8+32*7]
     mov                   dstq, [rsp+16*3]
     lea                     r4, [dstq+8]
     mov             [rsp+16*3], r4
@@ -5189,17 +5181,16 @@
 cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r5, 4
-    mov                     r4, 2
+    mov                    r4d, 2
     sub                   eobd, 136
-    cmovge                  r4, r5
+    mov  [rsp+gprsize*1+16*35], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
 
-    mov  [rsp+gprsize*1+16*35], eobd
-    mov                     r3, r4
     mov  [rsp+gprsize*2+16*35], coeffq
 
 .pass1_loop:
@@ -5295,17 +5286,17 @@
     SAVE_8ROWS    coeffq+64*24, 64
 
     add                 coeffq, 16
-    dec                     r3
+    dec                    r3d
     jg .pass1_loop
 
 
 .pass2:
     mov                 coeffq, [rsp+gprsize*2+16*35]
-    mov                     r3, 4
+    mov                    r3d, 4
     lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
 
 .pass2_loop:
-    mov  [rsp+gprsize*3+16*35], r3
+    mov  [rsp+gprsize*3+16*35], r3d
     lea                     r3, [dstq+8]
     mov  [rsp+gprsize*2+16*35], r3
 
@@ -5405,8 +5396,8 @@
     lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
-    mov                     r3, [rsp+gprsize*3+16*35]
-    dec                     r3
+    mov                    r3d, [rsp+gprsize*3+16*35]
+    dec                    r3d
     jg .pass2_loop
 
     ret
@@ -5415,21 +5406,20 @@
 cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r4, 2
-    mov                     r5, 4
+    mov                    r4d, 2
     cmp                   eobd, 136
-    cmovge                  r4, r5
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
 
-    lea                     r3, [dstq+8]
-    mov   [rsp+gprsize*0+16*3], r3
-    mov   [rsp+gprsize*1+16*3], r4
-    mov   [rsp+gprsize*2+16*3], r4
+    lea                     r4, [dstq+8]
+    mov   [rsp+gprsize*0+16*3], r4
+    mov   [rsp+gprsize*1+16*3], r3d
+    mov   [rsp+gprsize*2+16*3], r3d
     mov   [rsp+gprsize*3+16*3], coeffq
-    mov                     r3, r4
 
 .loop:
     LOAD_8ROWS          coeffq, 64
@@ -5449,11 +5439,11 @@
     REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
 
     add                 coeffq, 16
-    dec                     r3
+    dec                    r3d
     jg .loop
 
-    mov                     r4, [rsp+gprsize*2+16*3]
-    dec                     r4
+    mov                    r4d, [rsp+gprsize*2+16*3]
+    dec                    r4d
     jle .ret
 
     mov                   dstq, [rsp+gprsize*0+16*3]
@@ -5462,7 +5452,7 @@
     lea                     r3, [dstq+8]
     add                 coeffq, 64*8
     mov   [rsp+gprsize*0+16*3], r3
-    mov                     r3, [rsp+gprsize*1+16*3]
+    mov                    r3d, [rsp+gprsize*1+16*3]
     mov   [rsp+gprsize*3+16*3], coeffq
     jmp .loop
 
@@ -5496,17 +5486,16 @@
 cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r5, 4
-    mov                     r4, 2
+    mov                    r4d, 2
     sub                   eobd, 151
-    cmovge                  r4, r5
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
 
-    mov  [rsp+gprsize*1+16*67], eobd
-    mov                     r3, r4
     mov  [rsp+gprsize*2+16*67], coeffq
 
 .pass1_loop:
@@ -5531,17 +5520,17 @@
     SAVE_8ROWS     coeffq+64*0, 64
 
     add                 coeffq, 16
-    dec                     r3
+    dec                    r3d
     jg .pass1_loop
 
     mov                 coeffq, [rsp+gprsize*2+16*67]
-    mov                     r3, 2
+    mov                    r3d, 2
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
     lea                     r4, [o(m(idct_16x64_internal).end1)]
 
 .pass2_loop:
-    mov  [rsp+gprsize*3+16*67], r3
+    mov  [rsp+gprsize*3+16*67], r3d
     mov                   eobd, [rsp+gprsize*1+16*67]
 
     mova                    m0, [coeffq+16*4 ]            ;in1
@@ -5673,12 +5662,12 @@
     sub                    rsp, 16*32
 
     mov                   dstq, [rsp+gprsize*2+16*67]
-    mov                     r3, [rsp+gprsize*3+16*67]
+    mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
     lea                     r4, [o(m(idct_16x64_internal).end1)]
 
-    dec                     r3
+    dec                    r3d
     jg .pass2_loop
     ret
 
@@ -6648,17 +6637,16 @@
 cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r5, 4
-    mov                     r4, 2
+    mov                    r4d, 2
     sub                   eobd, 136
-    cmovge                  r4, r5
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
 
-    mov  [rsp+gprsize*1+16*67], eobd
-    mov                     r3, r4
     mov  [rsp+gprsize*2+16*67], coeffq
 
 .pass1_loop:
@@ -6744,12 +6732,12 @@
     SAVE_8ROWS    coeffq+64*24, 64
 
     add                 coeffq, 16
-    dec                     r3
+    dec                    r3d
     jg .pass1_loop
 
 .pass2:
     mov                 coeffq, [rsp+gprsize*2+16*67]
-    mov                     r3, 4
+    mov                    r3d, 4
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
     lea                     r4, [o(m(idct_16x64_internal).end1)]
@@ -6782,17 +6770,16 @@
 cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r5, 4
-    mov                     r4, 2
+    mov                    r4d, 2
     sub                   eobd, 136
-    cmovge                  r4, r5
+    mov  [rsp+gprsize*1+16*67], eobd
+    mov                    r3d, 4
+    cmovs                  r3d, r4d
 
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
 
-    mov  [rsp+gprsize*1+16*67], eobd
-    mov                     r3, r4
     mov  [rsp+gprsize*2+16*67], coeffq
     mov  [rsp+gprsize*3+16*67], dstq
     lea                   dstq, [rsp+gprsize+16*69]
@@ -6907,7 +6894,7 @@
 
     add                 coeffq, 16
     add                   dstq, 16
-    dec                     r3
+    dec                    r3d
     jg .pass1_loop
 
 .pass2:
@@ -6917,7 +6904,7 @@
     lea                   dstq, [dstq+32]
     mov  [rsp+gprsize*1+16*35], eobd
     lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
-    mov                     r3, 4
+    mov                    r3d, 4
     jmp m(idct_32x32_internal).pass2_loop
 
 .pass2_end:
@@ -6929,8 +6916,8 @@
     lea                   tx2q, [o(m(idct_64x32_internal).pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
-    mov                     r3, [rsp+gprsize*3+16*35]
-    dec                     r3
+    mov                    r3d, [rsp+gprsize*3+16*35]
+    dec                    r3d
     jg m(idct_32x32_internal).pass2_loop
 
 .pass2_end2:
@@ -6937,7 +6924,7 @@
     mov                   dstq, [rsp+gprsize*3+16*67]
     mov                 coeffq, [rsp+gprsize*2+16*67]
     lea                   tx2q, [o(m(idct_32x32_internal).pass2_end)]
-    mov                     r3, 4
+    mov                    r3d, 4
     jmp m(idct_32x32_internal).pass2_loop
 
 
@@ -6963,10 +6950,10 @@
 cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     %undef cmp
 
-    mov                     r5, 4
-    mov                     r4, 2
+    mov                    r5d, 4
+    mov                    r4d, 2
     sub                   eobd, 136
-    cmovge                  r4, r5
+    cmovns                 r4d, r5d
 
 %if ARCH_X86_32
     LEA                     r5, $$
@@ -6973,7 +6960,7 @@
 %endif
 
     mov  [rsp+gprsize*1+16*67], eobd
-    mov                     r3, r4
+    mov                    r3d, r4d
     mov  [rsp+gprsize*4+16*67], coeffq
     mov  [rsp+gprsize*3+16*67], dstq
     lea                   dstq, [rsp+gprsize+16*69]
@@ -7096,7 +7083,7 @@
 
     add                 coeffq, 16
     add                   dstq, 16
-    dec                     r3
+    dec                    r3d
     jg .pass1_loop
 
 .pass2:
@@ -7103,7 +7090,7 @@
     mov                   dstq, [rsp+gprsize*3+16*67]
     mov                 coeffq, [rsp+gprsize*2+16*67]
     lea                   dstq, [dstq+32]
-    mov                     r3, 4
+    mov                    r3d, 4
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
     lea                     r4, [o(m(idct_64x64_internal).pass2_end)]
@@ -7122,18 +7109,18 @@
     sub                    rsp, 16*32
 
     mov                   dstq, [rsp+gprsize*2+16*67]
-    mov                     r3, [rsp+gprsize*3+16*67]
+    mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
     lea                     r4, [o(m(idct_64x64_internal).pass2_end)]
 
-    dec                     r3
+    dec                    r3d
     jg  m(idct_16x64_internal).pass2_loop
 
 .pass2_end2:
     mov                 coeffq, [rsp+gprsize*4+16*67]
     mov                   dstq, [rsp+gprsize*2+16*67]
-    mov                     r3, 4
+    mov                    r3d, 4
     sub                   dstq, 72
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
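
An aside on the eob-threshold counters above: the old mov/cmp/cmovg ladders become branch-free carry arithmetic. A hedged walk-through of the 16x32 sequence (eobd/r3d/r4d are x86inc register aliases, so this is an annotated excerpt rather than a standalone listing):

    mov   r4d, eobd
    cmp   eobd, 43       ; CF = eob < 43 (unsigned)
    sbb   r3d, r3d       ; r3d = -CF: -1 if eob <= 42, else 0
    cmp   r4d, 150       ; CF = eob < 150
    sbb   r3d, 0         ; r3d -= CF
    cmp   r4d, 278       ; CF = eob < 278
    sbb   r3d, -4        ; r3d += 4 - CF, giving 1, 2, 3 or 4 iterations

The "sub eobd, 136" / cmovs sequences work the same way: the flags from the subtraction select between the two pass-1 iteration counts directly, which also removes the later "mov r3, r4" copy.
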
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -347,7 +347,7 @@
     punpckhbw    xm0, xm1
 
     ; when we reach this, xm0 contains left two px in highest words
-    cmp           xq, -16
+    cmp           xd, -16
     jle .loop_x
 .partial_load_and_extend:
     vpbroadcastb  m3, [srcq-1]
@@ -396,17 +396,17 @@
     ; else if x < xlimd we extend from previous load (this implies have_right=0)
     ; else we are done
 
-    cmp           xq, -16
+    cmp           xd, -16
     jle .loop_x
-    test          xq, xq
+    test          xd, xd
     jl .partial_load_and_extend
-    cmp           xq, xlimq
+    cmp           xd, xlimd
     jl .right_extend
 
     add       sumsqq, (384+16)*4
     add         sumq, (384+16)*2
     add         srcq, strideq
-    dec hd
+    dec           hd
     jg .loop_y
     RET
 
@@ -418,7 +418,7 @@
     shr        ylimd, 2
     sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
 .loop_x:
-    lea           yd, [hd+ylimd+2]
+    lea           yd, [hq+ylimq+2]
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
     lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
     test       edged, 4                             ; have_top
@@ -720,9 +720,9 @@
     punpckhbw    xm0, xm1
 
     ; when we reach this, xm0 contains left two px in highest words
-    cmp           xq, -16
+    cmp           xd, -16
     jle .loop_x
-    test          xq, xq
+    test          xd, xd
     jge .right_extend
 .partial_load_and_extend:
     vpbroadcastb  m3, [srcq-1]
@@ -781,11 +781,11 @@
     ; else if x < xlimd we extend from previous load (this implies have_right=0)
     ; else we are done
 
-    cmp           xq, -16
+    cmp           xd, -16
     jle .loop_x
-    test          xq, xq
+    test          xd, xd
     jl .partial_load_and_extend
-    cmp           xq, xlimq
+    cmp           xd, xlimd
     jl .right_extend
 
     add       sumsqq, (384+16)*4
@@ -803,7 +803,7 @@
     shr        ylimd, 2
     sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
 .loop_x:
-    lea           yd, [hd+ylimd+2]
+    lea           yd, [hq+ylimq+2]
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
     lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
     test       edged, 4                             ; have_top
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -725,7 +725,7 @@
     punpckhbw    xm0, xm1
 
     ; when we reach this, m0 contains left two px in highest words
-    cmp           xq, -8
+    cmp           xd, -8
     jle .loop_x
 .partial_load_and_extend:
     movd          m3, [srcq-4]
@@ -1299,9 +1299,9 @@
     punpckhbw     m0, m1
 
     ; when we reach this, m0 contains left two px in highest words
-    cmp           xq, -8
+    cmp           xd, -8
     jle .loop_x
-    test          xq, xq
+    test          xd, xd
     jge .right_extend
 .partial_load_and_extend:
     XCHG_PIC_REG
@@ -1394,11 +1394,11 @@
     ; else if x < xlimd we extend from previous load (this implies have_right=0)
     ; else we are done
 
-    cmp           xq, -8
+    cmp           xd, -8
     jle .loop_x
-    test          xq, xq
+    test          xd, xd
     jl .partial_load_and_extend
-    cmp           xq, xlimq
+    cmp           xd, xlimd
     jl .right_extend
 
     add       sumsqq, (384+16)*4
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -1425,7 +1425,7 @@
     jmp                  wq
 .h_w2:
 %if ARCH_X86_32
-    and                 mxd, 0xff
+    and                 mxd, 0x7f
 %else
     movzx               mxd, mxb
 %endif
@@ -1455,7 +1455,7 @@
     RET
 .h_w4:
 %if ARCH_X86_32
-    and                 mxd, 0xff
+    and                 mxd, 0x7f
 %else
     movzx               mxd, mxb
 %endif
@@ -1850,7 +1850,11 @@
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
     jg .hv_w8
-    and                 mxd, 0xff
+%if ARCH_X86_32
+    and                 mxd, 0x7f
+%else
+    movzx               mxd, mxb
+%endif
     dec                srcq
     movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
 %if ARCH_X86_32
@@ -2511,7 +2515,7 @@
     jmp                  wq
 .h_w4:
 %if ARCH_X86_32
-    and                 mxd, 0xff
+    and                 mxd, 0x7f
 %else
     movzx               mxd, mxb
 %endif
@@ -2635,7 +2639,7 @@
 .v:
 %if ARCH_X86_32
     mov                 mxd, myd
-    and                 mxd, 0xff
+    and                 mxd, 0x7f
 %else
  %assign stack_offset org_stack_offset
     WIN64_SPILL_XMM      16
@@ -2849,12 +2853,12 @@
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
     jg .hv_w8
-    and                 mxd, 0xff
+    and                 mxd, 0x7f
     movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
 %if ARCH_X86_32
     mov                 mxd, myd
-    and                 mxd, 0xff
     shr                 myd, 16
+    and                 mxd, 0x7f
     cmp                  hd, 4
     cmovle              myd, mxd
     movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
@@ -3101,9 +3105,9 @@
  %define             accuv0  [rsp+mmsize*11]
  %define             accuv1  [rsp+mmsize*12]
     movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
-    movzx               mxd, myw
-    and                 mxd, 0xff
+    mov                 mxd, myd
     shr                 myd, 16
+    and                 mxd, 0x7f
     cmp                  hd, 4
     cmovle              myd, mxd
     movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
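
Two more encoding notes on the loop-counter changes, as hedged sketches only. Comparing a 32-bit counter as a 32-bit register drops the REX.W byte:

    cmp   rdx, -16       ; 48 83 FA F0  (4 bytes)
    cmp   edx, -16       ; 83 FA F0     (3 bytes)

And the lea rewrites in looprestoration.asm (hd+ylimd to hq+ylimq) exploit the fact that 32-bit address operands need a 0x67 address-size prefix in 64-bit mode, while 64-bit address registers with a 32-bit destination do not; the low 32 bits of the result are identical either way:

    lea   eax, [edx+ecx+2] ; 67 8D 44 0A 02  (5 bytes)
    lea   eax, [rdx+rcx+2] ; 8D 44 0A 02     (4 bytes)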