shithub: libvpx

Download patch

ref: 5327fcf8572a37b6aaefab362ef2f6ed4d0b4cc9
parent: ac7f403cbea3f6ec6f36c0c3e18dce1d0fe1c963
parent: 944ad6cac9391fc84c189c31a950e8e88ee62205
author: Frank Galligan <[email protected]>
date: Tue Jul 7 20:16:02 EDT 2015

Merge "Add vp9_int_pro_row_neon."

--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -286,6 +286,11 @@
         make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
         make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
 
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
 #endif
 
 #if HAVE_MSA
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -821,7 +821,7 @@
 specialize qw/vp9_satd sse2/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2/;
+specialize qw/vp9_int_pro_row sse2 neon/;
 
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
 specialize qw/vp9_int_pro_col sse2/;
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -47,3 +47,56 @@
 
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
+
+void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}