shithub: dav1d

Download patch

ref: a440af4a51abf484b637ef936872dd378f40d86a
parent: 18d2d7507e00b79e6093961a3ce82b9f82ac50c7
author: Henrik Gramner <[email protected]>
date: Sat Jan 12 16:38:35 EST 2019

Add ipred_z3 AVX2 asm

Also backport some minor optimizations to z1.

--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -28,7 +28,7 @@
 
 %if ARCH_X86_64
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 %macro SMOOTH_WEIGHT_TABLE 1-*
     %rep %0
@@ -57,7 +57,6 @@
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
-; Note that the order of (some of) the following z constants matter
 z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
 z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
@@ -65,10 +64,18 @@
               db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
 z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
               db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_12:        times 4 db 12 ; those are just placed here for alignment.
+pb_14:        times 4 db 14
+z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
 z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
 z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
-z_upsample:   db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
-z_shuf_w4:    db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
+z_upsample3:  db  0,  0,  0,  0,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5
+z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
+z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
 z_base_inc:   dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
               dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
 
@@ -76,13 +83,14 @@
 filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
               db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
 filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
-filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
+filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
+pb_127_m127:  times 2 db 127, -127
 ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
               db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
 ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
-              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0
+              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
+pw_64:        times 2 dw 64
 
-pb_0to15:
 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         ; w=8, w_pad=1 as well as second half of previous one
@@ -94,26 +102,27 @@
                         ; w=16,w_pad=3
                         db 0, 1, 2, 3, 4, 5
                         times 13 db 6, 7
+pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-pb_1:   times 4 db 1
-pb_2:   times 4 db 2
-pb_4:   times 4 db 4
-pb_8:   times 4 db 8
-pb_12:  times 4 db 12
-pb_14:  times 4 db 14
-pb_15   times 4 db 15
-pb_31:  times 4 db 31
-pb_128: times 4 db 128
-pw_1:   times 2 dw 1
-pw_8:   times 2 dw 8
-pw_62:  times 2 dw 62
-pw_64:  times 2 dw 64
-pw_128: times 2 dw 128
-pw_255: times 2 dw 255
-pw_512: times 2 dw 512
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1  (ipred_h_shuf+12)
+%define pb_2  (ipred_h_shuf+20)
+%define pb_3  (ipred_h_shuf+ 4)
+%define pb_4  (ipred_h_shuf+24)
+%define pb_7  (ipred_h_shuf+ 0)
+%define pb_8  (z_upsample2 +12)
+%define pb_15 (z_filter_s  +32)
+%define pw_8  (z_filter_k  +32)
 
-pb_36_m4:    times 2 db  36,   -4
-pb_127_m127: times 2 db 127, -127
+pb_27:    times 4 db 27
+pb_31:    times 4 db 31
+pb_128:   times 4 db 128
+pw_1:     times 2 dw 1
+pw_62:    times 2 dw 62
+pw_128:   times 2 dw 128
+pw_255:   times 2 dw 255
+pw_512:   times 2 dw 512
+pb_36_m4: times 2 db 36, -4
 
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
@@ -138,6 +147,7 @@
 JMP_TABLE ipred_dc_left,  avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_h,        avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_z1,       avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3,       avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_cfl,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
@@ -1315,10 +1325,8 @@
     jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
     ALLOC_STACK         -32, 8
     mova                xm1, [tlq-1]
-    pshufb              xm0, xm1, [z_upsample]
-    vpbroadcastd        xm2, [pb_8]
-    pminub              xm2, [z_filter_s+6]
-    pshufb              xm1, xm2
+    pshufb              xm0, xm1, [z_upsample1]
+    pshufb              xm1, [z_upsample2]
     vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
     add                 dxd, dxd        ; pw_512 (which is already in m3)
     pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
@@ -1375,13 +1383,14 @@
 .filter_strength: ; w4/w8/w16
     ; The C version uses a lot of branches, but we can do all the comparisons
     ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+    lea                  r3, [z_filter_t0]
     movd                xm0, maxbased
     movd                xm2, angled
-    lea                  r3, [z_filter_t0]
     shr              angled, 8 ; is_sm << 1
     vpbroadcastb         m0, xm0
     vpbroadcastb         m2, xm2
-    pcmpeqb              m1, m0, [r3-z_filter_t0+z_filter_wh]
+    pcmpeqb              m1, m0, [base+z_filter_wh]
     pand                 m1, m2
     mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
     pcmpgtb              m1, m2
@@ -1398,14 +1407,13 @@
     call .filter_strength
     mov            maxbased, 7
     jz .w4_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m7, [pb_8]
+    vpbroadcastd         m7, [base+pb_8]
     vbroadcasti128       m2, [tlq-1]
-    pminub               m1, m7, [r3-z_filter_k+z_filter_s+4]
-    vpbroadcastd         m8, [r3+r5*4+12*0]
-    pminub               m7, [r3-z_filter_k+z_filter_s+12]
-    vpbroadcastd         m9, [r3+r5*4+12*1]
-    vpbroadcastd        m10, [r3+r5*4+12*2]
+    pminub               m1, m7, [base+z_filter_s]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pminub               m7, [base+z_filter_s+8]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
     pshufb               m0, m2, m1
     shufps               m1, m7, q2121
     pmaddubsw            m0, m8
@@ -1432,7 +1440,7 @@
     mov                 r3d, dxd ; xpos
     movd                xm9, maxbased
     vpbroadcastw         m9, xm9
-    vbroadcasti128       m8, [z_shuf_w4]
+    vbroadcasti128       m8, [z1_shuf_w4]
     psrlw                m7, 8  ; top[max_base_x]
     paddw               m10, m6, m6
     psubw                m9, m0 ; max_base_x
@@ -1502,7 +1510,7 @@
     movd                xm6, hd
     vinserti128          m0, [tlq+7], 1
     vpbroadcastb        xm6, xm6
-    vbroadcasti128       m1, [z_upsample]
+    vbroadcasti128       m1, [z_upsample1]
     pminub              xm6, xm2
     vpbroadcastd         m7, [pb_36_m4]
     vinserti128          m2, xm6, 1
@@ -1561,9 +1569,8 @@
     jg .w8_upsample_loop
     RET
 .w8_no_intra_edge_filter:
-    mov                 r3d, 15
-    cmp                  hd, 8
-    cmova          maxbased, r3d
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(h+7, 15)
     jmp .w8_main
 .w8_no_upsample:
     %assign stack_offset org_stack_offset
@@ -1572,27 +1579,22 @@
     test             angled, 0x400
     jnz .w8_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd        xm6, [pb_15]
-    pminub              xm6, xm0 ; imin(h, 8) + 7
-    movd           maxbased, xm6
-    movzx          maxbased, maxbaseb
     jz .w8_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
     movu                xm2, [tlq]
-    pminub              xm1, xm6, [r3-z_filter_k+z_filter_s+18]
+    pminub              xm1, xm0, [base+z_filter_s+14]
     vinserti128          m2, [tlq-1], 1
-    vinserti128          m1, [r3-z_filter_k+z_filter_s+ 4], 1
-    vpbroadcastd         m7, [r3+r5*4+12*0]
-    pminub              xm6, [r3-z_filter_k+z_filter_s+26]
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+12], 1
-    pshufb               m0, m2, m1
-    pmaddubsw            m0, m7
-    vpbroadcastd         m7, [r3+r5*4+12*1]
+    vinserti128          m1, [base+z_filter_s+ 0], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pminub              xm0, [base+z_filter_s+22]
+    vinserti128          m0, [base+z_filter_s+ 8], 1
+    pshufb               m6, m2, m1
+    pmaddubsw            m6, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+15]
-    shufps               m1, m6, q2121
+    shufps               m1, m0, q2121
     pshufb               m1, m2, m1
     pmaddubsw            m1, m7
-    paddw                m0, m1
+    paddw                m1, m6
     sub                 r5d, 3
     jnz .w8_3tap
     ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
@@ -1600,15 +1602,15 @@
     ; slightly different from out[max_base_x] when h > w.
     vpbroadcastd         m7, [z_filter_k+4*8]
     movzx               r2d, byte [tlq+14]
-    pshufb               m2, m6
+    pshufb               m2, m0
     pmaddubsw            m2, m7
     sub                 r2d, r3d
     lea                 r2d, [r2+r3*8+4]
     shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
     mov            [rsp+16], r2b
-    paddw                m0, m2
+    paddw                m1, m2
 .w8_3tap:
-    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     sar                 r5d, 1
     mov                 tlq, rsp
     add                 r5d, 17 ; w*2 + (filter_strength == 3)
@@ -1615,9 +1617,9 @@
     cmp                  hd, 8
     cmova          maxbased, r5d
     mov            [tlq+r5], r3b
-    vextracti128        xm1, m0, 1
-    packuswb            xm1, xm0
-    mova              [tlq], xm1
+    vextracti128        xm0, m1, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
 .w8_main:
     movd                xm2, dxd
     vbroadcasti128       m0, [z_base_inc]
@@ -1668,9 +1670,8 @@
 .w8_end:
     RET
 .w16_no_intra_edge_filter:
-    mov                 r3d, 31
-    cmp                  hd, 16
-    cmova          maxbased, r3d
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(h+15, 31)
     jmp .w16_main
 ALIGN function_align
 .w16:
@@ -1680,25 +1681,18 @@
     test             angled, 0x400
     jnz .w16_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd         m1, [pb_31]
-    pminub               m0, m1 ; imin(h, 16) + 15
-    movd           maxbased, xm0
-    movzx          maxbased, maxbaseb
     jz .w16_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m1, [pb_12]
-    vpbroadcastd        m11, [pb_15]
-    vbroadcasti128       m6, [r3-z_filter_k+z_filter_s+12]
-    vinserti128          m2, m6, [r3-z_filter_k+z_filter_s+4], 0
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+20], 1
+    vpbroadcastd         m1, [base+pb_12]
+    vbroadcasti128       m6, [base+z_filter_s+8]
+    vinserti128          m2, m6, [base+z_filter_s], 0
+    vinserti128          m6, [base+z_filter_s+16], 1
     mova               xm10, [tlq-1]
     vinserti128         m10, [tlq+3], 1
-    vpbroadcastd         m9, [r3+r5*4+12*0]
-    vbroadcasti128       m7, [r3-z_filter_k+z_filter_s+18]
-    vinserti128          m8, m7, [r3-z_filter_k+z_filter_s+10], 0
-    vinserti128          m7, [r3-z_filter_k+z_filter_s+26], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+14]
+    vinserti128          m8, m7, [base+z_filter_s+6], 0
+    vinserti128          m7, [base+z_filter_s+22], 1
     psubw                m0, m1
-    pminub               m0, m11 ; imin(h+3, 15)
     movu               xm11, [tlq+12]
     vinserti128         m11, [tlq+16], 1
     pminub               m8, m0
@@ -1709,7 +1703,7 @@
     pshufb               m1, m11, m8
     shufps               m8, m7, q2121
     pmaddubsw            m1, m9
-    vpbroadcastd         m9, [r3+r5*4+12*1]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+31]
     pshufb               m2, m10, m2
     pmaddubsw            m2, m9
@@ -2131,6 +2125,1169 @@
 .w64_end:
     RET
 
+cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z3_avx2_table] ; jump table with the .h4 ... .h64 entries
+    tzcnt                hd, hm ; table index: tzcnt(4) = 2 selects the first entry (the table symbol is biased by -2*4)
+    movifnidn        angled, anglem
+    lea                  r7, [dr_intra_derivative+90*2]
+    dec                 tlq ; tlq = tl - 1
+    movsxd               hq, [r6+hq*4] ; signed 32-bit table entry
+    sub              angled, 180
+    add                  hq, r6 ; entry + table base = branch target
+    movzx               dyd, angleb ; low byte of (angle - 180)
+    xor              angled, 0x400 ; flip bit 10; the "test angled, 0x400" checks below treat a set bit as !enable_intra_edge_filter
+    neg                 dyq
+    movzx               dyd, word [r7+dyq*2] ; dy = 16-bit entry number 90 - ((angle - 180) & 0xff) of dr_intra_derivative
+    vpbroadcastd         m3, [pw_512] ; pmulhrsw by 512 = rounded >> 6
+    vpbroadcastd         m4, [pw_62] ; mask that extracts frac << 1 from a position
+    vpbroadcastd         m5, [pw_64] ; 64 - (frac << 1) = (32 - frac) << 1
+    mov              org_wd, wd ; wd is consumed as a loop counter; org_w is read again in .h8_transpose
+    jmp                  hq
+.h4: ; h == 4, upsampled-edge path
+    lea                  r7, [strideq*3] ; offset of the fourth output row
+    cmp              angleb, 40 ; angleb = low byte of (angle - 180)
+    jae .h4_no_upsample
+    lea                 r4d, [angleq-1024]
+    sar                 r4d, 7
+    add                 r4d, wd
+    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+    ALLOC_STACK         -32, 9
+    movu                xm8, [tlq-7] ; 16 edge bytes starting at tlq-7
+    pshufb              xm0, xm8, [z_upsample3]
+    vpbroadcastb        xm2, xm8
+    pshufb              xm1, xm8, [z_filter_s+2]
+    mova           [rsp+16], xm2 ; top[max_base_y]
+    vpbroadcastd        xm2, [pb_36_m4] ; (36, -4) byte pairs = 4 * (9, -1)
+    add                 dyd, dyd ; dy *= 2
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    movd                xm7, dyd
+    mov                 r2d, dyd ; r2 = position of column 0
+    vpbroadcastw         m7, xm7 ; m7 = doubled dy in every word
+    paddw               xm1, xm0
+    pmulhrsw            xm1, xm3 ; rounded >> 6
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2 ; m6 = positions of columns 2, 3, 0, 1 (one column per qword)
+    punpcklbw           xm1, xm8 ; interleave the filtered bytes with the original edge bytes
+    mova                xm8, [z_transpose4]
+    psllw                m7, 2 ; position step per iteration (4 columns)
+    pshufb              xm1, [pb_15to0] ; reverse the byte order
+    mova              [rsp], xm1 ; [rsp+0..15] = edge buffer indexed by the loop below
+.h4_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base0
+    vpbroadcastq         m1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base1
+    vpbroadcastq         m2, [rsp+r4]
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base2
+    movq                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base3
+    movhps              xm0, [rsp+r4]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; fractional position bits (mask 62)
+    vpblendd             m0, m1, 0xf0 ; m0 = pixels for columns 2, 3, 0, 1
+    psubw                m1, m5, m2 ; 64 - fractional bits
+    psllw                m2, 8
+    por                  m1, m2 ; byte pairs (64 - f, f) used as pmaddubsw weights
+    pmaddubsw            m0, m1
+    paddw                m6, m7 ; advance all positions by 4 columns
+    pmulhrsw             m0, m3 ; rounded >> 6
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm8 ; transpose (z_transpose4)
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4 ; next 4 columns
+    sub                  wd, 4
+    jg .h4_upsample_loop
+    RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+    lea                  r4, [z_filter_t0] ; r4 anchors every [base+...] operand until it is overwritten
+    movd                xm0, maxbased
+    movd                xm2, angled
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0 ; m0 = maxbase in every byte (callers reuse this after the call)
+    vpbroadcastb         m2, xm2 ; m2 = low byte of angle in every byte
+    pcmpeqb              m1, m0, [base+z_filter_wh] ; 0xff where the z_filter_wh entry equals maxbase
+    pand                 m1, m2 ; angle byte in the matching positions, 0 elsewhere
+    mova                xm2, [r4+angleq*8] ; 16 threshold bytes: z_filter_t0, or z_filter_t1 when is_sm is set
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    popcnt              r5d, r5d ; r5d = filter strength; ZF is set when it is 0
+    ret ; callers branch on ZF right after the call
+.h4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 12
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h4_main
+    lea            maxbased, [wq+3] ; value matched against z_filter_wh by .filter_strength
+    call .filter_strength
+    mov            maxbased, 7 ; mov leaves the flags from popcnt untouched
+    jz .h4_main ; filter_strength == 0
+    vpbroadcastd         m7, [base+pb_7]
+    vbroadcasti128       m2, [tlq-14] ; 16 edge bytes in both lanes
+    pmaxub               m1, m7, [base+z_filter_s-4]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0] ; first tap pair for this strength (r5 = 1..3)
+    pmaxub               m7, [base+z_filter_s+4]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2] ; third tap pair
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2 ; sum of the three tap-pair products
+    pmulhrsw             m0, m3 ; rounded >> 6
+    mov                 r4d, 9 ; overwrites r4, so [base+...] must not be used past this point
+    lea                 tlq, [rsp+15] ; tlq now addresses the filtered copy stored at [rsp]
+    cmp                  wd, 4
+    cmova          maxbased, r4d ; w > 4: maxbase = 9, otherwise it stays 7
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [rsp], xm0 ; 16 filtered edge bytes
+.h4_main:
+    movd                xm6, dyd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    mov                  r4, tlq
+    sub                 tlq, 4
+    neg                 dyq ; positions move towards lower addresses from here on
+    vpbroadcastw         m6, xm6 ; m6 = dy in every word
+    sub                  r4, maxbaseq
+    shl            maxbased, 6 ; maxbase << 6, same scale as the positions
+    vpbroadcastb         m7, [r4] ; byte at tl[-maxbase]
+    lea                  r4, [dyq+63] ; ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf_w4]
+    neg            maxbaseq ; maxbase = 63 - (maxbase << 6), the bound r4d is compared with in the loop
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8  ; top[max_base_y]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_y
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
+    paddw               m10, m10 ; m10 = 4*dy, the step per iteration
+    mova               xm11, [z_transpose4]
+.h4_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base0
+    vpbroadcastq         m1, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base2
+    movq                xm0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac << 1
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; (32 - frac) << 1
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; (32-frac, frac) << 1
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_y
+    pmulhrsw             m0, m3 ; rounded >> 6
+    paddsw               m6, m10    ; ypos += dy
+    vpblendvb            m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_y]
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm11   ; transpose
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jz .h4_end ; all columns written
+    cmp                 r4d, maxbased
+    jg .h4_loop ; keep interpolating while the position is above the bound
+    packuswb            xm7, xm7 ; words -> bytes: every byte = top[max_base_y]
+.h4_end_loop: ; fill the remaining columns with top[max_base_y]
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r7       ], xm7
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_end_loop
+.h4_end:
+    RET
+ALIGN function_align
+.h8: ; h == 8, upsampled-edge path
+    lea                 r4d, [angleq+216] ; the low byte carries into bit 8 when it is >= 40
+    mov                 r4b, wb ; low byte = w; the upper bits keep the carry and the flag bits of angle
+    cmp                 r4d, 8
+    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    and                 r4d, 4
+    mova                xm0, [tlq-15]
+    vinserti128          m0, [tlq- 9], 1
+    movd                xm1, r4d
+    movu                xm2, [z_filter_s+2]
+    vinserti128          m2, [z_filter_s+6], 1
+    vpbroadcastb        xm1, xm1 ; w & 4
+    vpbroadcastd         m7, [pb_36_m4] ; (36, -4) byte pairs = 4 * (9, -1)
+    pmaxub              xm1, [z_upsample3] ; clip 4x8
+    vinserti128          m1, [z_upsample1], 1
+    add                 dyd, dyd ; dy *= 2
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    vinserti128          m0, [tlq-7], 1
+    movd                xm6, dyd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6 ; m6 = doubled dy in every word
+    mov                 r2d, dyd ; r2 = position of column 0
+    lea                  r5, [strideq*3] ; offset of the fourth row of each 4-row group
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3 ; rounded >> 6
+    pslldq               m2, m7, 8
+    paddw                m7, m7 ; position step per iteration (4 columns)
+    paddw                m6, m2
+    vbroadcasti128       m2, [pb_15to0]
+    packuswb             m1, m1
+    punpcklbw            m1, m0 ; interleave the filtered bytes with the original edge bytes
+    pshufb               m1, m2 ; reverse the byte order within each lane
+    vextracti128   [rsp+ 0], m1, 1
+    mova           [rsp+16], xm1 ; [rsp+0..31] = edge buffer indexed by the loop below
+.h8_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base0
+    movu                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base1
+    vinserti128          m0, [rsp+r4], 1
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base2
+    pand                 m1, m4, m6 ; fractional position bits (mask 62)
+    psubw                m2, m5, m1 ; 64 - fractional bits
+    psllw                m1, 8
+    por                  m2, m1 ; byte pairs (64 - f, f) used as pmaddubsw weights
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base3
+    vinserti128          m1, [rsp+r4], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3 ; rounded >> 6
+    paddw                m6, m7 ; advance all positions by 4 columns
+    pmulhrsw             m1, m3 ; rounded >> 6
+    lea                  r4, [dstq+strideq*4] ; r4 = address of rows 4..7
+    psllw                m1, 8
+    por                  m0, m1 ; m0 results in the even bytes, m1 results in the odd bytes
+    vextracti128        xm1, m0, 1
+    punpcklbw           xm2, xm0, xm1
+    punpckhbw           xm0, xm1
+    movd   [dstq+strideq*0], xm2
+    pextrd [dstq+strideq*1], xm2, 1
+    pextrd [dstq+strideq*2], xm2, 2
+    pextrd [dstq+r5       ], xm2, 3
+    movd   [r4  +strideq*0], xm0
+    pextrd [r4  +strideq*1], xm0, 1
+    pextrd [r4  +strideq*2], xm0, 2
+    pextrd [r4  +r5       ], xm0, 3
+    add                dstq, 4 ; next 4 columns
+    sub                  wd, 4
+    jg .h8_upsample_loop
+    RET
+.h8_no_intra_edge_filter:
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(w+7, 15)
+    jmp .h8_main
+.h8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [wq+7] ; value matched against z_filter_wh by .filter_strength
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h8_no_intra_edge_filter
+    call .filter_strength
+    jz .h8_main ; filter_strength == 0
+    vpbroadcastd        xm6, [base+pb_15]
+    pcmpeqb             xm1, xm1 ; all ones (-1 in every byte)
+    psubusb             xm6, xm0 ; saturating 15 - (w + 7); xm0 still holds maxbase from .filter_strength
+    psubb               xm6, xm1 ; w == 4 ? 5 : 1
+    movu                xm2, [tlq-16]
+    pmaxub              xm1, xm6, [base+z_filter_s]
+    vinserti128          m2, [tlq-14], 1
+    vinserti128          m1, [base+z_filter_s+12], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0] ; first tap pair for this strength (r5 = 1..3)
+    pmaxub              xm6, [base+z_filter_s+ 8]
+    vinserti128          m6, [base+z_filter_s+20], 1
+    pshufb               m0, m2, m1
+    pmaddubsw            m0, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
+    movzx               r4d, byte [tlq-15] ; overwrites r4, so [base+...] must not be used past this point
+    shufps               m1, m6, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m0, m1
+    sub                 r5d, 3 ; r5d = filter_strength - 3
+    jnz .h8_3tap ; only filter_strength == 3 needs the third tap pair
+    vpbroadcastd         m7, [z_filter_k+4*8] ; same address as z_filter_k-4 + 3*4 + 12*2
+    movzx               r2d, byte [tlq-14]
+    pshufb               m2, m6
+    pmaddubsw            m2, m7
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3 ; (tl[-14] + tl[-15]*7 + 4) >> 3
+    mov            [rsp+15], r2b
+    paddw                m0, m2
+.h8_3tap:
+    pmulhrsw             m0, m3 ; rounded >> 6
+    sar                 r5d, 1
+    lea                 tlq, [rsp+31] ; tlq now addresses the filtered copy on the stack
+    add                 r5d, 17 ; 16 + (filter_strength == 3)
+    cmp                  wd, 8
+    cmova          maxbased, r5d ; w > 8: maxbase = r5d
+    neg                  r5
+    mov            [tlq+r5], r4b ; unfiltered tl[-15] byte placed just below the filtered bytes
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova           [tlq-15], xm0 ; 16 filtered edge bytes at [rsp+16]
+.h8_main:
+    movd                xm2, dyd
+    vbroadcasti128       m0, [z_base_inc] ; base_inc << 6
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq ; positions move towards lower addresses from here on
+    vpbroadcastw         m2, xm2 ; m2 = dy in every word
+    sub                  r4, maxbaseq
+    shl            maxbased, 6 ; maxbase << 6, same scale as the positions
+    vpbroadcastb         m7, [r4] ; byte at tl[-maxbase]
+    lea                  r4, [dyq+63] ; ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq ; maxbase = 63 - (maxbase << 6), the bound r4d is compared with in the loop
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8 ; top[max_base_y] as words
+    psubw                m9, m0 ; max_base_y
+    paddw                m6, m2, m2 ; m6 = 2*dy, the step per iteration (2 columns)
+    vpblendd             m2, m6, 0x0f ; low lane = ypos of column 1 (2*dy), high lane = ypos of column 0 (dy)
+.h8_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base0
+    pand                 m0, m4, m2 ; frac << 1
+    psubw                m1, m5, m0 ; (32 - frac) << 1
+    psllw                m0, 8
+    por                  m1, m0 ; (32-frac, frac) << 1
+    vbroadcasti128       m0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base1
+    vinserti128          m0, [tlq+r5], 0
+    sub                 rsp, 8*2 ; reserve a 16-byte slot for this pair of columns
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2 ; base < max_base_y
+    paddsw               m2, m6 ; ypos += 2*dy
+    pmulhrsw             m0, m3 ; rounded >> 6
+    vpblendvb            m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_y]
+    vextracti128        xm1, m0, 1
+    psllw               xm0, 8
+    por                 xm0, xm1 ; interleave rows (partial transpose)
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jz .h8_transpose ; all columns computed
+    cmp                 r4d, maxbased
+    jg .h8_loop ; keep interpolating while the position is above the bound
+    packuswb            xm0, xm7, xm7 ; every byte = top[max_base_y]
+.h8_end_loop: ; push constant slots for the remaining column pairs
+    sub                 rsp, 8*2
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jg .h8_end_loop
+.h8_transpose: ; xm0 = the slot at [rsp+16*0] (last one written)
+    mova                xm2, [rsp+16*1]
+    sub              org_wd, 8 ; flags from this sub are consumed by cmovg and jge below
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6 ; w > 8: start at the rightmost 8 columns (dst + w - 8)
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+    lea                  r6, [dstq+strideq*4] ; r6 = address of rows 4..7
+    jge .h8_w8 ; w >= 8
+    add                 rsp, 16*2 ; w == 4: release the two slots
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r2       ], xm1, 3
+    movd   [r6  +strideq*0], xm2
+    pextrd [r6  +strideq*1], xm2, 1
+    pextrd [r6  +strideq*2], xm2, 2
+    pextrd [r6  +r2       ], xm2, 3
+    jmp .h8_end
+.h8_w8_loop:
+    mova                xm0, [rsp+16*0]
+    mova                xm2, [rsp+16*1]
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+.h8_w8: ; w8/w16/w32
+    mova                xm0, [rsp+16*2]
+    mova                xm4, [rsp+16*3]
+    add                 rsp, 16*4 ; release four slots (8 columns)
+    punpcklwd           xm3, xm4, xm0
+    punpckhwd           xm4, xm0
+    punpckldq           xm0, xm3, xm1
+    punpckhdq           xm3, xm1
+    punpckldq           xm1, xm4, xm2
+    punpckhdq           xm4, xm2
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm3
+    movhps [dstq+r2       ], xm3
+    movq   [r6  +strideq*0], xm1
+    movhps [r6  +strideq*1], xm1
+    movq   [r6  +strideq*2], xm4
+    movhps [r6  +r2       ], xm4
+    sub                dstq, 8 ; move left to the previous 8 columns
+    sub                  r6, 8
+    sub              org_wd, 8
+    jge .h8_w8_loop
+.h8_end:
+    RET
+.h16_no_intra_edge_filter:
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(w+15, 31)
+    jmp .h16_main
+ALIGN function_align
+.h16: ; h == 16
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [wq+15] ; value matched against z_filter_wh by .filter_strength
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h16_no_intra_edge_filter
+    call .filter_strength
+    jz .h16_main ; filter_strength == 0
+    vpbroadcastd        m11, [base+pb_27]
+    vpbroadcastd         m1, [base+pb_1]
+    vbroadcasti128       m6, [base+z_filter_s+12]
+    vinserti128          m2, m6, [base+z_filter_s+4], 0
+    vinserti128          m6, [base+z_filter_s+20], 1
+    movu               xm10, [tlq-18]
+    vinserti128         m10, [tlq-14], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0] ; first tap pair for this strength (r5 = 1..3)
+    vbroadcasti128       m7, [base+z_filter_s+8]
+    vinserti128          m8, m7, [base+z_filter_s+0], 0
+    vinserti128          m7, [base+z_filter_s+16], 1
+    psubusb             m11, m0 ; saturating 27 - (w + 15); m0 still holds maxbase from .filter_strength
+    por                  m1, m11 ; lower clamp for the shuffle indices: 9, 5 or 1 for w = 4, 8, >= 16
+    movu               xm11, [tlq-32]
+    vinserti128         m11, [tlq-28], 1
+    pmaxub               m8, m1
+    pmaxub               m7, m1
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
+    movzx               r4d, byte [tlq-31] ; overwrites r4, so [base+...] must not be used past this point
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3 ; r5d = filter_strength - 3
+    jnz .h16_3tap ; only filter_strength == 3 needs the third tap pair
+    vpbroadcastd         m9, [z_filter_k+4*8] ; same address as z_filter_k-4 + 3*4 + 12*2
+    movzx               r2d, byte [tlq-30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3 ; (tl[-30] + tl[-31]*7 + 4) >> 3
+    mov            [rsp+31], r2b
+    paddw                m0, m10
+    paddw                m1, m11
+.h16_3tap:
+    pmulhrsw             m0, m3 ; rounded >> 6
+    pmulhrsw             m1, m3 ; rounded >> 6
+    sar                 r5d, 1
+    lea                 tlq, [rsp+63] ; tlq now addresses the filtered copy on the stack
+    add                 r5d, 33 ; 32 + (filter_strength == 3)
+    cmp                  wd, 16
+    cmova          maxbased, r5d ; w > 16: maxbase = r5d
+    neg                  r5
+    mov            [tlq+r5], r4b ; unfiltered tl[-31] byte placed just below the filtered bytes
+    packuswb             m0, m1
+    vpermq               m0, m0, q2031 ; reorder the qwords after the per-lane pack
+    mova           [tlq-31], m0 ; 32 filtered edge bytes at [rsp+32]
+.h16_main:
+    movd                xm6, dyd  ; low dword of xm6 = dy
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq  ; dyq = -dy from here on
+    vpbroadcastw         m6, xm6  ; every word of m6 = low 16 bits of dy
+    sub                  r4, maxbaseq  ; r4 = tl (before the -8 adjust) - max_base
+    shl            maxbased, 6  ; max_base*64
+    vpbroadcastb         m7, [r4]  ; m7 = byte at tl[-max_base] in every lane (fill value used below)
+    lea                  r4, [dyq+63]  ; r4 = 63 - dy (dyq already negated)
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]  ; reversed adjacent-byte pairs (8,7, 7,6, ... 1,0)
+    neg            maxbaseq  ; maxbase = 63 - max_base*64: loop bound compared against r4d
+    vpbroadcastw         m9, xm9  ; every word = max_base*64
+    psubw                m9, m0  ; minus the z_base_inc table (table contents are outside this chunk)
+    paddw               m11, m6, m6  ; m11 = 2*dy, the per-iteration step
+    psubw               m10, m9, m3 ; 64*8
+    vpblendd             m6, m11, 0xf0  ; low 128 bits keep dy, high 128 bits take 2*dy
+.h16_loop:
+    lea                  r5, [r4+dyq]  ; r5 = r4 advanced by one step (used for the high lane)
+    sar                  r4, 6  ; integer part of the position
+    pand                 m1, m4, m6  ; m4/m5 are set up before this chunk; presumably frac mask and 64 - confirm
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1  ; word = (m5 - frac) | frac << 8: byte-pair weights for pmaddubsw
+    movu                xm0, [tlq+r4-0]
+    movu                xm1, [tlq+r4-8]
+    lea                  r4, [r5+dyq]  ; r4 = position for the next iteration
+    sar                  r5, 6
+    vinserti128          m0, [tlq+r5-0], 1
+    vinserti128          m1, [tlq+r5-8], 1
+    sub                 rsp, 32  ; grow the stack by one 32-byte slot for this iteration's output
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3  ; m3 is set up before this chunk; if it is 512 this is a rounded >>6 - confirm
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6  ; word mask: threshold > current position
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2  ; word masks -> byte mask
+    paddsw               m6, m11  ; advance positions by 2*dy (saturating)
+    vpblendvb            m0, m7, m0, m1  ; keep interpolated bytes where the mask is set, m7 elsewhere
+    vpermq               m0, m0, q3120  ; swap the two middle qwords (packuswb packs per 128-bit lane)
+    mova              [rsp], m0
+    sub                  wd, 2  ; two units of w are consumed per iteration
+    jz .h16_transpose
+    cmp                 r4d, maxbased
+    jg .h16_loop  ; continue while r4 is still above the bound
+    mova                 m0, m7  ; .h16_transpose uses m0 as the newest slot, which will be m7 after the fill below
+.h16_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7  ; remaining slots are filled with the m7 splat
+    sub                  wd, 2
+    jg .h16_end_loop
+.h16_transpose:
+    mova                 m2, [rsp+32*1]  ; second-newest slot; the newest is still live in m0
+    sub              org_wd, 8  ; flags are consumed by cmovg and jge below (lea/SIMD ops leave them intact)
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6  ; if org_w > 8: dst += org_w - 8
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    lea                  r3, [strideq*5]
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    lea                  r4, [strideq+r2*2] ; stride*7
+    jge .h16_w8  ; org_w >= 8 takes the 8-wide store path
+    add                 rsp, 32*2  ; release the two slots used by the 4-wide case
+    movd   [dstq+strideq*0], xm0  ; 4-wide path: 16 rows of 4 bytes each
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    vextracti128        xm0, m0, 1
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    lea                dstq, [dstq+strideq*8]  ; advance to rows 8-15
+    vextracti128        xm1, m1, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    jmp .h16_end
+.h16_w8_loop:
+    mova                 m0, [rsp+32*0]  ; later passes reload the first two slots that the entry path had in registers
+    mova                 m2, [rsp+32*1]
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+.h16_w8:
+    mova                 m2, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    lea                  r6, [dstq+strideq*8]  ; r6 = address of row 8
+    add                 rsp, 32*4  ; release the four slots consumed by this pass
+    punpcklbw            m3, m4, m2
+    punpckhbw            m4, m2
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
+    punpckldq            m4, m2, m0
+    punpckhdq            m2, m0
+    punpckldq            m0, m3, m1
+    punpckhdq            m3, m1
+    movq   [dstq+strideq*0], xm4  ; 16 rows of 8 bytes each: rows 0-7 via dstq, rows 8-15 via r6
+    movhps [dstq+strideq*1], xm4
+    vextracti128        xm4, m4, 1
+    movq   [dstq+strideq*2], xm2
+    movhps [dstq+r2       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+strideq*4], xm0
+    movhps [dstq+r3       ], xm0
+    vextracti128        xm0, m0, 1
+    movq   [dstq+r2*2     ], xm3
+    movhps [dstq+r4       ], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*0], xm4
+    movhps   [r6+strideq*1], xm4
+    movq     [r6+strideq*2], xm2
+    movhps   [r6+r2       ], xm2
+    movq     [r6+strideq*4], xm0
+    movhps   [r6+r3       ], xm0
+    movq     [r6+r2*2     ], xm3
+    movhps   [r6+r4       ], xm3
+    sub                dstq, 8  ; next pass writes the 8 pixels to the left
+    sub              org_wd, 8
+    jge .h16_w8_loop
+.h16_end:
+    RET
+ALIGN function_align
+.h32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea            maxbased, [wq+31]
+    and            maxbased, 31
+    or             maxbased, 32 ; imin(w+31, 63)
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h32_main  ; filter disabled: predict straight from the caller's edge at tlq
+    vbroadcasti128       m0, [pb_0to15]
+    mov                 r4d, 21
+    mov                 r5d, 3
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    sub                 r4d, wd ; 21-w
+    cmovg               r5d, r4d  ; r5 = 21-w when that is positive, otherwise 3
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    sub                 r4d, 8 ; 13-w
+    movd                xm1, r5d
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movd                xm2, r4d
+    vpbroadcastb         m1, xm1
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    vpbroadcastb         m2, xm2
+    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
+    movu                 m7, [z_filter_s+4]
+    pshufb              m11, m1
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vinserti128          m7, [z_filter_s+16], 0
+    pmaxsb               m2, m0 ; clip 8x32
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]  ; first coefficient pair
+    pshufb              m12, m2
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    shufps               m8, m7, q1021
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]  ; second coefficient pair
+    pshufb              m10, m11, m8
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    shufps               m8, m7, q2121
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]  ; third coefficient pair
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    movzx               r4d, byte [tlq-63]  ; read both edge bytes before tlq is repointed at the stack
+    movzx               r2d, byte [tlq-62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
+    pmulhrsw             m0, m3  ; m3 is set up before this chunk; if it is 512 this is a rounded >>6 - confirm
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3  ; r2 = (tl[-62] + 7*tl[-63] + 4) >> 3
+    mov            [rsp+31], r2b
+    lea                 tlq, [rsp+95]  ; from here tlq addresses the filtered copy on the stack
+    mov            [tlq-65], r4b  ; unfiltered tl[-63] goes to rsp+30
+    mov                 r4d, 65
+    cmp                  wd, 32
+    cmova          maxbased, r4d  ; max_base = 65 when w > 32
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0  ; rsp+32 (aligned store)
+    mova           [tlq-31], m1  ; rsp+64
+.h32_main:
+    movd                xm6, dyd  ; low dword of xm6 = dy
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq  ; dyq = -dy from here on
+    vpbroadcastw         m6, xm6  ; every word of m6 = low 16 bits of dy
+    sub                  r4, maxbaseq  ; r4 = tl (before the -8 adjust) - max_base
+    shl            maxbased, 6  ; max_base*64
+    vpbroadcastb         m7, [r4]  ; m7 = byte at tl[-max_base] in every lane (fill value used below)
+    lea                  r4, [dyq+63]  ; r4 = 63 - dy (dyq already negated)
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]  ; reversed adjacent-byte pairs (8,7, 7,6, ... 1,0)
+    neg            maxbaseq  ; maxbase = 63 - max_base*64: loop bound compared against r4d
+    vpbroadcastw         m9, xm9  ; every word = max_base*64
+    psubw                m9, [z_base_inc]  ; minus the z_base_inc table (table contents are outside this chunk)
+    mova                m11, m6  ; per-iteration step = dy
+    psubw               m10, m9, m3 ; 64*8
+.h32_loop:
+    mov                  r5, r4
+    sar                  r5, 6  ; integer part of the position
+    pand                 m1, m4, m6  ; m4/m5 are set up before this chunk; presumably frac mask and 64 - confirm
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1  ; word = (m5 - frac) | frac << 8: byte-pair weights for pmaddubsw
+    movu                xm0, [tlq+r5- 0]  ; four 16-byte loads at offsets 0/-16 (m0) and -8/-24 (m1)
+    vinserti128          m0, [tlq+r5-16], 1
+    movu                xm1, [tlq+r5- 8]
+    vinserti128          m1, [tlq+r5-24], 1
+    sub                 rsp, 32  ; grow the stack by one 32-byte slot for this iteration's output
+    add                  r4, dyq  ; advance the position by -dy
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6  ; word mask: threshold > current position
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2  ; word masks -> byte mask
+    paddsw               m6, m11  ; advance positions by dy (saturating)
+    vpblendvb            m0, m7, m0, m1  ; keep interpolated bytes where the mask is set, m7 elsewhere
+    mova              [rsp], m0
+    dec                  wd  ; one unit of w is consumed per iteration
+    jz .h32_transpose
+    cmp                 r4d, maxbased
+    jg .h32_loop  ; continue while r4 is still above the bound
+.h32_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7  ; remaining slots are filled with the m7 splat
+    dec                  wd
+    jg .h32_end_loop
+.h32_transpose:
+    lea                dstq, [dstq+org_wq-8]  ; dst += org_w - 8
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+    mova                 m7, [rsp+32*0]  ; [rsp] is the most recently written slot
+    mova                 m6, [rsp+32*1]
+    mova                 m5, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    mova                 m3, [rsp+32*4]
+    mova                 m2, [rsp+32*5]
+    mova                 m1, [rsp+32*6]
+    mova                 m0, [rsp+32*7]
+    lea                  r6, [dstq+strideq*8]  ; r6 = address of row 8
+    add                 rsp, 32*8  ; release the eight slots consumed by this pass
+    punpcklbw            m8, m0, m1  ; byte -> word -> dword interleave of the eight slots
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    movq   [dstq+strideq*0], xm6  ; 32 rows of 8 bytes each: rows 0-7 via dstq, rows 8-31 via r6
+    movhps [dstq+strideq*1], xm6
+    vextracti128        xm6, m6, 1
+    movq   [dstq+strideq*2], xm7
+    movhps [dstq+r2       ], xm7
+    vextracti128        xm7, m7, 1
+    movq   [dstq+strideq*4], xm2
+    movhps [dstq+r3       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+r2*2     ], xm8
+    movhps [dstq+r4       ], xm8
+    vextracti128        xm8, m8, 1
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    vextracti128        xm1, m1, 1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    vextracti128        xm5, m5, 1
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    lea                  r6, [r6+strideq*8]  ; advance to rows 16-23
+    vextracti128        xm0, m0, 1
+    movq     [r6+strideq*0], xm6
+    movhps   [r6+strideq*1], xm6
+    movq     [r6+strideq*2], xm7
+    movhps   [r6+r2       ], xm7
+    movq     [r6+strideq*4], xm2
+    movhps   [r6+r3       ], xm2
+    movq     [r6+r2*2     ], xm8
+    movhps   [r6+r4       ], xm8
+    lea                  r6, [r6+strideq*8]  ; advance to rows 24-31
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    sub                dstq, 8  ; next pass writes the 8 pixels to the left
+    sub              org_wd, 8
+    jg .h32_w8_loop
+    RET
+ALIGN function_align
+.h64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [wq+63]  ; max_base = w + 63
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h64_main  ; filter disabled: predict straight from the caller's edge at tlq
+    mov                 r4d, 21
+    vpbroadcastb       xm11, [tlq-127]
+    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
+    sub                 r4d, wd ; 21-w
+    mov                 r5d, 3
+    vinserti128         m11, [tlq-116], 1    ; 104-111
+    movu                 m7, [z_filter_s+4]
+    cmp                  wd, 32
+    cmove               r4d, r5d  ; r4 = 3 when w == 32, otherwise 21-w
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vbroadcasti128       m6, [pb_0to15]
+    movd                xm1, r4d
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]  ; first coefficient pair
+    movu               xm12, [tlq-122]       ; 112-119
+    vinserti128         m12, [tlq-108], 1    ;  96-103
+    vpbroadcastb         m1, xm1
+    movu               xm13, [tlq- 98]       ;  88- 95
+    vinserti128         m13, [tlq- 84], 1    ;  72- 79
+    movu               xm14, [tlq- 90]       ;  80- 87
+    vinserti128         m14, [tlq- 76], 1    ;  64- 71
+    vinserti128          m7, [z_filter_s+16], 0
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pmaxsb               m1, m6 ; clip (16|32)x64
+    pshufb              m13, m1
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]  ; second coefficient pair
+    shufps              m15, m8, m7, q1021
+    pshufb              m10, m11, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]  ; third coefficient pair
+    shufps              m10, m8, m7, q2132
+    pshufb              m11, m10
+    pmaddubsw           m11, m9
+    pshufb              m12, m10
+    pmaddubsw           m12, m9
+    pshufb              m13, m10
+    pmaddubsw           m13, m9
+    pshufb              m14, m10
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    pmulhrsw             m0, m3  ; m3 is set up before this chunk; if it is 512 this is a rounded >>6 - confirm
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 tlq, [rsp+127]  ; from here tlq addresses the filtered copy on the stack
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova          [tlq-127], m0  ; rsp+0
+    mova          [tlq- 95], m1  ; rsp+32
+    pshufb               m0, m11, m10  ; second pass: same three coefficient pairs, applied in the order 2, 1, 0
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m10
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m10
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    paddw                m0, m7
+    pshufb               m7, m12, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m1, m7
+    pshufb               m7, m14, m10
+    pmaddubsw            m7, m9
+    paddw                m6, m7
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m15
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0  ; rsp+64
+    mova           [tlq-31], m1  ; rsp+96
+.h64_main:
+    movd                xm6, dyd  ; low dword of xm6 = dy
+    mov                  r4, tlq
+    sub                 tlq, 24
+    neg                 dyq  ; dyq = -dy from here on
+    vpbroadcastw         m6, xm6  ; every word of m6 = low 16 bits of dy
+    sub                  r4, maxbaseq  ; r4 = tl (before the -24 adjust) - max_base
+    shl            maxbased, 6  ; max_base*64
+    vpbroadcastb         m7, [r4]  ; m7 = byte at tl[-max_base] in every lane (fill value used below)
+    lea                  r4, [dyq+63]  ; r4 = 63 - dy (dyq already negated)
+    movd               xm10, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]  ; reversed adjacent-byte pairs (8,7, 7,6, ... 1,0)
+    neg            maxbaseq  ; maxbase = 63 - max_base*64: loop bound compared against r4d
+    mova                xm1, [z_base_inc+16]  ; low lane = second 16 bytes of the table
+    vinserti128          m1, [z_base_inc], 1  ; high lane = first 16 bytes of the table
+    vpbroadcastw        m10, xm10  ; every word = max_base*64
+    psllw                m0, m3, 2   ; 64*32
+    psubw               m10, m1
+    mova                m14, m6  ; per-iteration step = dy
+    psubw               m11, m10, m3 ; 64*8
+    psubw               m12, m10, m0  ; m12 = m10 - (m3 << 2)
+    psubw               m13, m11, m0  ; m13 = m11 - (m3 << 2)
+.h64_loop:
+    mov                  r5, r4
+    sar                  r5, 6  ; integer part of the position
+    movu                 m0, [tlq+r5-0]
+    movu                 m1, [tlq+r5-8]
+    pand                 m2, m4, m6  ; m4/m5 are set up before this chunk; presumably frac mask and 64 - confirm
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2  ; word = (m5 - frac) | frac << 8: byte-pair weights for pmaddubsw
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m10, m6  ; word mask: threshold > current position
+    pcmpgtw              m2, m11, m6
+    packsswb             m1, m2  ; word masks -> byte mask
+    vpblendvb            m2, m7, m0, m1  ; keep interpolated bytes where the mask is set, m7 elsewhere
+    movu                 m0, [tlq+r5-32]  ; second 32-byte half, loaded 32 bytes lower in memory
+    movu                 m1, [tlq+r5-40]
+    add                  r4, dyq  ; advance the position by -dy
+    sub                 rsp, 64  ; grow the stack by one 64-byte slot for this iteration's output
+    mova           [rsp+32], m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pcmpgtw              m9, m12, m6  ; m9 (the weights) is no longer needed and is reused as a mask
+    pcmpgtw              m2, m13, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddsw               m6, m14  ; advance positions by dy (saturating)
+    packsswb             m9, m2
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m9
+    mova              [rsp], m0
+    dec                  wd  ; one unit of w is consumed per iteration
+    jz .h64_transpose
+    cmp                 r4d, maxbased
+    jg .h64_loop  ; continue while r4 is still above the bound
+.h64_end_loop:
+    sub                 rsp, 64
+    mova           [rsp+32], m7  ; remaining slots are filled with the m7 splat
+    mova           [rsp+ 0], m7
+    dec                  wd
+    jg .h64_end_loop
+.h64_transpose:
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    imul                 r5, strideq, -8  ; r5 = -8*stride, so "sub dstq, r5" moves down 8 rows
+    lea                dstq, [dstq+org_wq-16]  ; dst += org_w - 16
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+    lea                  r6, [rsp+16*3]  ; start with the top 16 bytes of each 64-byte slot
+.h64_transpose_loop:
+    mova                xm0, [r6+64*15]  ; gather the same 16-byte slice from 16 consecutive slots
+    vinserti128          m0, [r6+64* 7], 1
+    mova                xm1, [r6+64*14]
+    vinserti128          m1, [r6+64* 6], 1
+    mova                xm2, [r6+64*13]
+    vinserti128          m2, [r6+64* 5], 1
+    mova                xm3, [r6+64*12]
+    vinserti128          m3, [r6+64* 4], 1
+    mova                xm4, [r6+64*11]
+    vinserti128          m4, [r6+64* 3], 1
+    mova                xm5, [r6+64*10]
+    vinserti128          m5, [r6+64* 2], 1
+    mova                xm6, [r6+64* 9]
+    vinserti128          m6, [r6+64* 1], 1
+    mova                xm7, [r6+64* 8]
+    vinserti128          m7, [r6+64* 0], 1
+    sub                  r6, 16  ; next pass takes the slice 16 bytes lower
+    punpcklbw            m8, m0, m1  ; byte -> word -> dword interleave of the sixteen slices
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    vpermq               m6, m6, q3120  ; swap the two middle qwords of each result
+    vpermq               m7, m7, q3120
+    vpermq               m2, m2, q3120
+    vpermq               m8, m8, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m1, m1, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm6  ; 16 rows of 16 bytes per pass; mova is an aligned store, so dst must be 16-byte aligned here
+    vextracti128 [dstq+strideq*1], m6, 1
+    mova         [dstq+strideq*2], xm7
+    vextracti128 [dstq+r2       ], m7, 1
+    mova         [dstq+strideq*4], xm2
+    vextracti128 [dstq+r3       ], m2, 1
+    mova         [dstq+r2*2     ], xm8
+    vextracti128 [dstq+r4       ], m8, 1
+    sub               dstq, r5  ; dst += 8 rows
+    mova         [dstq+strideq*0], xm3
+    vextracti128 [dstq+strideq*1], m3, 1
+    mova         [dstq+strideq*2], xm1
+    vextracti128 [dstq+r2       ], m1, 1
+    mova         [dstq+strideq*4], xm5
+    vextracti128 [dstq+r3       ], m5, 1
+    mova         [dstq+r2*2     ], xm0
+    vextracti128 [dstq+r4       ], m0, 1
+    sub                dstq, r5  ; dst += 8 rows
+    cmp                  r6, rsp
+    jae .h64_transpose_loop  ; four passes: r6 = rsp+48, +32, +16, +0
+    add                 rsp, 64*16  ; release the sixteen slots consumed above
+    lea                dstq, [dstq+r5*8-16]  ; back up 64 rows and move 16 pixels to the left
+    sub              org_wd, 16
+    jg .h64_transpose_loop0
+.h64_end:
+    RET
+
 %macro FILTER_XMM 4 ; dst, src, tmp, shuf
 %ifnum %4
     pshufb             xm%2, xm%4
@@ -2168,7 +3325,7 @@
     pmaddubsw           m%3, m5
     paddw               m%1, m%3
     psraw               m%1, 4
-    vperm2i128          m%3, m%1, m%1, 0x01
+    vpermq              m%3, m%1, q1032
     packuswb            m%1, m%3
 %endmacro
 
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -39,6 +39,7 @@
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
 decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
 
 decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
@@ -86,6 +87,7 @@
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
     c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
+    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
     c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
 
     c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;