shithub: openh264

Download patch

ref: f36959bf4b897b66e54bf5e236618aaff517ec67
parent: a63e13eecdf495b6b4880a4dc4b03715e6640a55
author: Sindre Aamås <[email protected]>
date: Tue Mar 7 09:16:59 EST 2017

[Common/x86] Simplify satd_sad X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment when X86_32_PICASM is defined; all other builds
keep them in .rodata.

--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -53,7 +53,11 @@
 ;***********************************************************************
 ; Data
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 align 16
 HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
@@ -772,29 +776,12 @@
     mov  r12, r2
 %endif
 
+    INIT_X86_32_PIC r2
     pxor        xmm4,   xmm4
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm5,   [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqu      xmm6,   [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqu      xmm7,   [esp]
-    add         esp, 48
-%else
-    movdqa      xmm5,   [HSumSubDB1]
-    movdqa      xmm6,   [HSumSubDW1]
-    movdqa      xmm7,   [PDW1]
-%endif
+    movdqa      xmm5,   [pic(HSumSubDB1)]
+    movdqa      xmm6,   [pic(HSumSubDW1)]
+    movdqa      xmm7,   [pic(PDW1)]
+    DEINIT_X86_32_PIC
     sub         r0,    r1
     movdqu      xmm0,   [r0]
     movhlps     xmm1,   xmm0
@@ -916,9 +903,9 @@
 ret
 
 %macro SSE41_ChromaGetX38x8Satd 0
-    movdqa      xmm5,   [HSumSubDB1]
-    movdqa      xmm6,   [HSumSubDW1]
-    movdqa      xmm7,   [PDW1]
+    movdqa      xmm5,   [pic(HSumSubDB1)]
+    movdqa      xmm6,   [pic(HSumSubDW1)]
+    movdqa      xmm7,   [pic(PDW1)]
     sub         r0,    r1
     movq        xmm0,   [r0]
     punpcklqdq  xmm0,   xmm0
@@ -940,7 +927,7 @@
     SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
     movdqa      [r6+16], xmm0 ;H
 ;(sum+2)>>2
-    movdqa      xmm6,   [PDQ2]
+    movdqa      xmm6,   [pic(PDQ2)]
     movdqa      xmm5,   xmm4
     punpckhqdq  xmm5,   xmm1
     paddd       xmm5,   xmm6
@@ -993,88 +980,8 @@
     SIGN_EXTENSION r3, r3d
     SIGN_EXTENSION r5, r5d
 loop_chroma_satdx3:
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqa      xmm5, [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqa      xmm6, [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqa      xmm7, [esp]
-    mov         esp, r0
-    mov         r0, [esp + push_num*4 + 4]
-
-    sub         r0,    r1
-    movq        xmm0,  [r0]
-    punpcklqdq  xmm0,  xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-    movdqa      [r6],  xmm0 ;V
-    add         r0,     r1
-    pinsrb      xmm0,   byte[r0-1], 0
-    pinsrb      xmm0,   byte[r0+r1-1], 1
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     2
-    pinsrb      xmm0,   byte[r0+r1-1], 3
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     4
-    pinsrb      xmm0,   byte[r0+r1-1], 5
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     6
-    pinsrb      xmm0,   byte[r0+r1-1], 7
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-;movdqa      [r6+16], xmm0 ;H
-;(sum+2)>>2
-    mov        DWORD [r6+16], 0x0002
-    mov        DWORD [r6+20], 0x0000
-    mov        DWORD [r6+24], 0x0002
-    mov        DWORD [r6+28], 0x0000
-    movdqa      xmm6,   [r6+16]
-    movdqa      [r6+16], xmm0 ;H
-
-    movdqa      xmm5,   xmm4
-    punpckhqdq  xmm5,   xmm1
-    paddd       xmm5,   xmm6
-    psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-    paddd       xmm6,   xmm6
-    paddd       xmm4,   xmm1
-    paddd       xmm4,   xmm6
-    psrld       xmm4,   3
-;satd *16
-    pslld       xmm5,   4
-    pslld       xmm4,   4
-;temp satd
-    movdqa      xmm6,   xmm4
-    punpcklqdq  xmm4,   xmm5
-    psllq       xmm4,   32
-    psrlq       xmm4,   32
-    movdqa      [r6+32], xmm4
-    punpckhqdq  xmm5,   xmm6
-    psllq       xmm5,   32
-    psrlq       xmm5,   32
-    movdqa      [r6+48], xmm5
-
-    pxor        xmm4,   xmm4 ;V
-    pxor        xmm5,   xmm5 ;H
-    pxor        xmm6,   xmm6 ;DC
-    mov         r0,    0
-    SSE41_ChromaGetX38x4Satd r0, 0
-    inc         r0
-    SSE41_ChromaGetX38x4Satd r0, 0
-%else
+    INIT_X86_32_PIC r4
     SSE41_ChromaGetX38x8Satd
-%endif
     SSEReg2MMX  xmm4, mm0,mm1
     SSEReg2MMX  xmm5, mm2,mm3
     SSEReg2MMX  xmm6, mm5,mm6
@@ -1081,89 +988,8 @@
     mov r0,     arg8
     mov r2,     arg9
 
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqa      xmm5,   [esp]
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    push        0xffff0001
-    movdqa      xmm6,   [esp]
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    push        0x00010001
-    movdqa      xmm7,   [esp]
-    mov         esp,    r0
-    mov r0,     arg8
-
-    sub         r0,    r1
-    movq        xmm0,   [r0]
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-    movdqa      [r6],  xmm0 ;V
-    add         r0,    r1
-    pinsrb      xmm0,   byte[r0-1], 0
-    pinsrb      xmm0,   byte[r0+r1-1], 1
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     2
-    pinsrb      xmm0,   byte[r0+r1-1], 3
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     4
-    pinsrb      xmm0,   byte[r0+r1-1], 5
-    lea         r0,    [r0+2*r1]
-    pinsrb      xmm0,   byte[r0-1],     6
-    pinsrb      xmm0,   byte[r0+r1-1], 7
-    punpcklqdq  xmm0,   xmm0
-    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-    ;movdqa      [r6+16], xmm0 ;H
-;(sum+2)>>2
-
-    mov        DWORD [r6+16], 0x0002
-    mov        DWORD [r6+20], 0x0000
-    mov        DWORD [r6+24], 0x0002
-    mov        DWORD [r6+28], 0x0000
-    movdqa      xmm6,   [r6+16]
-    movdqa      [r6+16], xmm0 ;H
-
-    movdqa      xmm5,   xmm4
-    punpckhqdq  xmm5,   xmm1
-    paddd       xmm5,   xmm6
-    psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-    paddd       xmm6,   xmm6
-    paddd       xmm4,   xmm1
-    paddd       xmm4,   xmm6
-    psrld       xmm4,   3
-;satd *16
-    pslld       xmm5,   4
-    pslld       xmm4,   4
-;temp satd
-    movdqa      xmm6,   xmm4
-    punpcklqdq  xmm4,   xmm5
-    psllq       xmm4,   32
-    psrlq       xmm4,   32
-    movdqa      [r6+32], xmm4
-    punpckhqdq  xmm5,   xmm6
-    psllq       xmm5,   32
-    psrlq       xmm5,   32
-    movdqa      [r6+48], xmm5
-
-    pxor        xmm4,   xmm4 ;V
-    pxor        xmm5,   xmm5 ;H
-    pxor        xmm6,   xmm6 ;DC
-    mov         r0,    0
-    SSE41_ChromaGetX38x4Satd r0, 0
-    inc         r0
-    SSE41_ChromaGetX38x4Satd r0, 0
-%else
     SSE41_ChromaGetX38x8Satd
-%endif
+    DEINIT_X86_32_PIC
 
     MMXReg2SSE  xmm0, xmm3, mm0, mm1
     MMXReg2SSE  xmm1, xmm3, mm2, mm3
@@ -1457,20 +1283,12 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
     %assign  push_num 0
+    INIT_X86_32_PIC r5
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0x01010101
-    push        0xff01ff01
-    push        0x01010101
-    movdqu      xmm4,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm4,[HSwapSumSubDB1]
-%endif
+    movdqa      xmm4,[pic(HSwapSumSubDB1)]
     movd        xmm2,[r2]
     movd        xmm5,[r2+r3]
     shufps      xmm2,xmm5,0
@@ -1511,6 +1329,7 @@
     SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
     ret
 
 ;***********************************************************************
@@ -1524,21 +1343,13 @@
     push  r5
 %endif
     %assign  push_num 2
+    INIT_X86_32_PIC r6
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    movdqa      xmm7, [pic(HSumSubDB1)]
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1549,6 +1360,7 @@
     SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop  r5
     pop  r4
@@ -1572,16 +1384,9 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa      xmm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6, xmm6
@@ -1614,6 +1419,7 @@
     push  r5
 %endif
     %assign  push_num 2
+    INIT_X86_32_PIC r6
     LOAD_4_PARA
     PUSH_XMM 8
     SIGN_EXTENSION r1, r1d
@@ -1621,16 +1427,7 @@
     push  r0
     push  r2
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    movdqa      xmm7, [pic(HSumSubDB1)]
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1650,6 +1447,7 @@
     SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
     POP_XMM
     LOAD_4_PARA_POP
+    DEINIT_X86_32_PIC
 %ifdef X86_32
     pop  r5
     pop  r4
@@ -1677,16 +1475,9 @@
     push  r0
     push  r2
 
-%ifdef X86_32_PICASM
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    movdqu      xmm7,   [esp]
-    add         esp, 16
-%else
-    movdqa      xmm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r4
+    movdqa      xmm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea         r4,  [r1+r1*2]
     lea         r5,  [r3+r3*2]
     pxor        xmm6,   xmm6
@@ -1863,19 +1654,9 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    mov         r1, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    vbroadcasti128 ymm7, [esp]
-    mov            esp, r1
-    mov            r1, [esp + push_num*4 + 8]
-%else
-    vbroadcasti128 ymm7, [HSumSubDB1]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
+    vbroadcasti128 ymm7, [pic(HSumSubDB1)]
+    DEINIT_X86_32_PIC
     lea            r5, [3 * r1]
     lea            r6, [3 * r3]
     vpxor          ymm6, ymm6, ymm6
@@ -1941,22 +1722,11 @@
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r3, r3d
 
-%ifdef X86_32_PICASM
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0xff01ff01
-    push        0xff01ff01
-    push        0x01010101
-    push        0x01010101
-    vpbroadcastq xmm0, [esp]
-    vpbroadcastq ymm6, [esp + 8]
-    mov            esp, r0
-    mov            r0, [esp + push_num*4 + 4]
-%else
-    vpbroadcastq xmm0, [HSumSubDB1]
-    vpbroadcastq ymm6, [HSumSubDB1 + 8]
-%endif
+    INIT_X86_32_PIC_NOPRESERVE r5
+    vpbroadcastq xmm0, [pic(HSumSubDB1)]
+    vpbroadcastq ymm6, [pic(HSumSubDB1 + 8)]
     vpblendd     ymm6, ymm0, ymm6, 11110000b
+    DEINIT_X86_32_PIC
     lea          r5, [3 * r1]
     lea          r6, [3 * r3]
     vpxor        ymm5, ymm5, ymm5