shithub: libvpx

--- a/test/lpf_8_test.cc

+++ b/test/lpf_8_test.cc

@@ -458,7 +458,8 @@

 INSTANTIATE_TEST_CASE_P(

     MMX, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_4_mmx, &vpx_lpf_horizontal_4_c, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_mmx>,

+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_mmx>,

                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));

 #endif  // HAVE_MMX

@@ -609,8 +610,8 @@

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_8_neon>,

                    &wrapper_nc<vpx_lpf_vertical_8_c>, 8, 1),

-        make_tuple(&vpx_lpf_horizontal_4_neon,

-                   &vpx_lpf_horizontal_4_c, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_neon>,

+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_vertical_4_neon>,

                    &wrapper_nc<vpx_lpf_vertical_4_c>, 8, 1)));

 INSTANTIATE_TEST_CASE_P(

@@ -633,7 +634,8 @@

 INSTANTIATE_TEST_CASE_P(

     DSPR2, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_dspr2>,

+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_dspr2>,

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

         make_tuple(&vpx_lpf_horizontal_16_dspr2,

@@ -666,7 +668,8 @@

 INSTANTIATE_TEST_CASE_P(

     MSA, Loop8Test6Param,

     ::testing::Values(

-        make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8, 1),

+        make_tuple(&wrapper_nc<vpx_lpf_horizontal_4_msa>,

+                   &wrapper_nc<vpx_lpf_horizontal_4_c>, 8, 1),

         make_tuple(&wrapper_nc<vpx_lpf_horizontal_8_msa>,

                    &wrapper_nc<vpx_lpf_horizontal_8_c>, 8, 1),

         make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),

--- a/vp10/common/loopfilter.c

+++ b/vp10/common/loopfilter.c

@@ -535,10 +535,10 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

@@ -546,7 +546,7 @@

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4 & 1) {

         if ((mask_4x4 & 3) == 3) {

@@ -563,22 +563,22 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4_int & 1) {

         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                             lfi->hev_thr, 1);

+                             lfi->hev_thr);

     s += 8 * count;

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -535,10 +535,10 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

@@ -546,7 +546,7 @@

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4 & 1) {

         if ((mask_4x4 & 3) == 3) {

@@ -563,22 +563,22 @@

           } else {

             if (mask_4x4_int & 1)

               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                   lfi->hev_thr, 1);

+                                   lfi->hev_thr);

             else if (mask_4x4_int & 2)

               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,

-                                   lfin->lim, lfin->hev_thr, 1);

+                                   lfin->lim, lfin->hev_thr);

           count = 2;

         } else {

-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);

           if (mask_4x4_int & 1)

             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                                 lfi->hev_thr, 1);

+                                 lfi->hev_thr);

       } else if (mask_4x4_int & 1) {

         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,

-                             lfi->hev_thr, 1);

+                             lfi->hev_thr);

     s += 8 * count;

--- a/vpx_dsp/arm/loopfilter_4_neon.asm

+++ b/vpx_dsp/arm/loopfilter_4_neon.asm

@@ -16,15 +16,12 @@

 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter

 ; works on 16 iterations at a time.

-; TODO(fgalligan): See about removing the count code as this function is only

-; called with a count of 1.

 ; void vpx_lpf_horizontal_4_neon(uint8_t *s,

 ;                                int p /* pitch */,

 ;                                const uint8_t *blimit,

 ;                                const uint8_t *limit,

-;                                const uint8_t *thresh,

-;                                int count)

+;                                const uint8_t *thresh)

 ; r0    uint8_t *s,

 ; r1    int p, /* pitch */

@@ -31,22 +28,16 @@

 ; r2    const uint8_t *blimit,

 ; r3    const uint8_t *limit,

 ; sp    const uint8_t *thresh,

-; sp+4  int count

 |vpx_lpf_horizontal_4_neon| PROC

     push        {lr}

     vld1.8      {d0[]}, [r2]               ; duplicate *blimit

-    ldr         r12, [sp, #8]              ; load count

     ldr         r2, [sp, #4]               ; load thresh

     add         r1, r1, r1                 ; double pitch

-    cmp         r12, #0

-    beq         end_vpx_lf_h_edge

     vld1.8      {d1[]}, [r3]               ; duplicate *limit

     vld1.8      {d2[]}, [r2]               ; duplicate *thresh

-count_lf_h_loop

     sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines

     add         r3, r2, r1, lsr #1         ; set to 3 lines down

@@ -69,11 +60,6 @@

     vst1.u8     {d6}, [r2@64], r1          ; store oq0

     vst1.u8     {d7}, [r3@64], r1          ; store oq1

-    add         r0, r0, #8

-    subs        r12, r12, #1

-    bne         count_lf_h_loop

-end_vpx_lf_h_edge

     pop         {pc}

     ENDP        ; |vpx_lpf_horizontal_4_neon|

--- a/vpx_dsp/arm/loopfilter_4_neon.c

+++ b/vpx_dsp/arm/loopfilter_4_neon.c

@@ -115,22 +115,18 @@

         int pitch,

         const uint8_t *blimit,

         const uint8_t *limit,

-        const uint8_t *thresh,

-        int count) {

+        const uint8_t *thresh) {

     int i;

     uint8_t *s, *psrc;

     uint8x8_t dblimit, dlimit, dthresh;

     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;

-    if (count == 0)  // end_vpx_lf_h_edge

-        return;

     dblimit = vld1_u8(blimit);

     dlimit = vld1_u8(limit);

     dthresh = vld1_u8(thresh);

     psrc = src - (pitch << 2);

-    for (i = 0; i < count; i++) {

+    for (i = 0; i < 1; i++) {

         s = psrc + i * 8;

         d3u8 = vld1_u8(s);

--- a/vpx_dsp/loopfilter.c

+++ b/vpx_dsp/loopfilter.c

@@ -119,12 +119,12 @@

 void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,

                             const uint8_t *blimit, const uint8_t *limit,

-                            const uint8_t *thresh, int count) {

+                            const uint8_t *thresh) {

   int i;

   // loop filter designed to work using chars so that we can make maximum use

   // of 8 bit simd instructions.

-  for (i = 0; i < 8 * count; ++i) {

+  for (i = 0; i < 8; ++i) {

     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];

     const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];

     const int8_t mask = filter_mask(*limit, *blimit,

@@ -138,8 +138,8 @@

                                  const uint8_t *limit0, const uint8_t *thresh0,

                                  const uint8_t *blimit1, const uint8_t *limit1,

                                  const uint8_t *thresh1) {

-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,

--- a/vpx_dsp/mips/loopfilter_4_msa.c

+++ b/vpx_dsp/mips/loopfilter_4_msa.c

@@ -13,13 +13,10 @@

 void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,

                               const uint8_t *b_limit_ptr,

                               const uint8_t *limit_ptr,

-                              const uint8_t *thresh_ptr,

-                              int32_t count) {

+                              const uint8_t *thresh_ptr) {

   uint64_t p1_d, p0_d, q0_d, q1_d;

   v16u8 mask, hev, flat, thresh, b_limit, limit;

   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

-  (void)count;

   /* load vector elements */

   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c

+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c

@@ -23,8 +23,7 @@

                                 int pitch,

                                 const uint8_t *blimit,

                                 const uint8_t *limit,

-                                const uint8_t *thresh,

-                                int count) {

+                                const uint8_t *thresh) {

   uint8_t   i;

   uint32_t  mask;

   uint32_t  hev;

@@ -312,8 +311,8 @@

                                      const uint8_t *blimit1,

                                      const uint8_t *limit1,

                                      const uint8_t *thresh1) {

-  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);

-  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);

+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);

+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);

 void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -559,7 +559,7 @@

 specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;

 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;

-add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";

+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";

 specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;

 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";

--- a/vpx_dsp/x86/loopfilter_mmx.asm

+++ b/vpx_dsp/x86/loopfilter_mmx.asm

@@ -18,14 +18,13 @@

 ;    int src_pixel_step,

 ;    const char *blimit,

 ;    const char *limit,

-;    const char *thresh,

-;    int  count

+;    const char *thresh

 ;)

 global sym(vpx_lpf_horizontal_4_mmx) PRIVATE

 sym(vpx_lpf_horizontal_4_mmx):

     push        rbp

     mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

+    SHADOW_ARGS_TO_STACK 5

     GET_GOT     rbx

     push        rsi

     push        rdi

@@ -39,8 +38,6 @@

         mov         rsi, arg(0) ;src_ptr

         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

-        movsxd      rcx, dword ptr arg(5) ;count

-.next8_h:

         mov         rdx, arg(3) ;limit

         movq        mm7, [rdx]

         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing

@@ -207,11 +204,6 @@

         psubsb      mm7, mm4              ; q1-= q1 add

         pxor        mm7, [GLOBAL(t80)]    ; unoffset

         movq        [rdi], mm7            ; write back

-        add         rsi,8

-        neg         rax

-        dec         rcx

-        jnz         .next8_h

     add rsp, 32

     pop rsp