shithub: openh264

Download patch

ref: f76daa92ad831c5ca179d2ec16ac9d7996874b51
parent: 36ee29037b3ebc36e848561392c05bbf2cb4cab8
author: Sindre Aamås <[email protected]>
date: Tue Mar 7 09:24:50 EST 2017

[Encoder/x86] Simplify and extend score X86_32_PICASM handling

Utilize program counter-relative offsets to simplify X86_32_PICASM
code.

In order for this to work with nasm, data constants are placed in
the text segment when X86_32_PICASM is defined.

Extend support for X86_32_PICASM to all routines and enable the
routines that were previously disabled under X86_32_PICASM.

--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -75,9 +75,7 @@
 
 #ifdef X86_ASM
 
-#ifndef X86_32_PICASM
 int32_t WelsGetNoneZeroCount_sse2 (int16_t* pLevel);
-#endif
 int32_t WelsGetNoneZeroCount_sse42 (int16_t* pLevel);
 
 /****************************************************************************
@@ -86,9 +84,7 @@
 void WelsScan4x4Ac_sse2 (int16_t* zig_value, int16_t* pDct);
 void WelsScan4x4DcAc_ssse3 (int16_t* pLevel, int16_t* pDct);
 void WelsScan4x4DcAc_sse2 (int16_t* pLevel, int16_t* pDct);
-#ifndef X86_32_PICASM
 int32_t WelsCalculateSingleCtr4x4_sse2 (int16_t* pDct);
-#endif
 
 /****************************************************************************
  * DCT functions
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -500,9 +500,7 @@
     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmx;
   }
   if (uiCpuFlag & WELS_CPU_SSE2) {
-#ifndef X86_32_PICASM
     pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_sse2;
-#endif
     pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_sse2;
 
     pFuncList->pfQuantization4x4        = WelsQuant4x4_sse2;
@@ -516,9 +514,7 @@
 
     pFuncList->pfScan4x4                = WelsScan4x4DcAc_sse2;
     pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_sse2;
-#ifndef X86_32_PICASM
     pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_sse2;
-#endif
 
     pFuncList->pfDctT4                  = WelsDctT4_sse2;
     pFuncList->pfDctFourT4              = WelsDctFourT4_sse2;
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -49,7 +49,11 @@
 ;***********************************************************************
 ; Local Data (Read Only)
 ;***********************************************************************
+%ifdef X86_32_PICASM
+SECTION .text align=16
+%else
 SECTION .rodata align=16
+%endif
 
 ;align 16
 ;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
@@ -200,6 +204,7 @@
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_ssse3
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_2_PARA
     movdqa     xmm0, [r1]
     movdqa     xmm1, [r1+16]
@@ -207,29 +212,12 @@
     pextrw      r1d,  xmm1, 0           ; eax = [8]
     pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
     pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
-%ifdef X86_32_PICASM
-    push        r0
-    mov         r0, esp
-    and         esp, 0xfffffff0
-    push        0x0d0c0706              ;pb_scanacdc_maska
-    push        0x05040b0a
-    push        0x0f0e0908
-    push        0x03020100
-    push        0x0f0e0d0c              ;pb_scanacdc_maskb
-    push        0x07060100
-    push        0x05040b0a
-    push        0x09080302
-    pshufb      xmm1, [esp]
-    pshufb      xmm0, [esp+16]
-    mov         esp, r0
-    pop         r0
-%else
-    pshufb      xmm1, [pb_scanacdc_maskb]
-    pshufb      xmm0, [pb_scanacdc_maska]
-%endif
+    pshufb      xmm1, [pic(pb_scanacdc_maskb)]
+    pshufb      xmm0, [pic(pb_scanacdc_maska)]
 
     movdqa     [r0],xmm0
     movdqa     [r0+16], xmm1
+    DEINIT_X86_32_PIC
     ret
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -268,7 +256,6 @@
     ret
 
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
@@ -279,6 +266,7 @@
     %else
     %assign push_num 0
     %endif
+    INIT_X86_32_PIC r4
     LOAD_1_PARA
     movdqa    xmm0, [r0]
     movdqa    xmm1, [r0+16]
@@ -309,16 +297,17 @@
 .find1end:
     sub       r1, r2
     sub       r1, 1
-    lea   r2,  [i_ds_table]
+    lea   r2,  [pic(i_ds_table)]
     add       r0b,  [r2+r1]
     mov       r1, r3
     and       r3, 0xff
     shr       r1, 8
     and       r1, 0xff
-    lea   r2 , [low_mask_table]
+    lea   r2 , [pic(low_mask_table)]
     add       r0b,  [r2 +r3]
-    lea   r2, [high_mask_table]
+    lea   r2, [pic(high_mask_table)]
     add       r0b,  [r2+r1]
+    DEINIT_X86_32_PIC
     %ifdef X86_32
     pop r3
     %else
@@ -325,15 +314,14 @@
     mov retrd, r0d
     %endif
     ret
-%endif ;ifndef X86_32_PICASM
 
 
-%ifndef X86_32_PICASM
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
 WELS_EXTERN WelsGetNoneZeroCount_sse2
     %assign push_num 0
+    INIT_X86_32_PIC r3
     LOAD_1_PARA
     movdqa    xmm0, [r0]
     movdqa    xmm1, [r0+16]
@@ -350,14 +338,14 @@
 ;   and       ecx,  0xff    ; we do not need this due to high 16bits equal to 0 yet
 ;   xor       retr,  retr
     ;add       al,  [nozero_count_table+r2]
-    lea       r0 , [nozero_count_table]
+    lea       r0 , [pic(nozero_count_table)]
     movzx     r2, byte [r0+r2]
     movzx     r1,   byte [r0+r1]
     mov   retrq, r2
     add   retrq, r1
     ;add       al,  [nozero_count_table+r1]
+    DEINIT_X86_32_PIC
     ret
-%endif ;%ifndef X86_32_PICASM
 
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -222,7 +222,6 @@
 }
 #endif //HAVE_AVX2
 
-#ifndef X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_sse2) {
   CMemoryAlign cMemoryAlign (0);
   ALLOC_MEMORY (int16_t, iDctC, 16);
@@ -236,7 +235,6 @@
   FREE_MEMORY (iDctC);
   FREE_MEMORY (iDctS);
 }
-#endif //#ifndef X86_32_PICASM
 #endif
 
 void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
@@ -304,11 +302,9 @@
   TestGetNoneZeroCount (WelsGetNoneZeroCount_c);
 }
 #ifdef X86_ASM
-#ifndef X86_32_PICASM
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse2) {
   TestGetNoneZeroCount (WelsGetNoneZeroCount_sse2);
 }
-#endif
 TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_sse42) {
   if (WelsCPUFeatureDetect (0) & WELS_CPU_SSE42)
     TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);