ref: e1be33b9c8cb20c62b26b9e3f02d206ddf54a80e
parent: c58e9d576c4eaf393f6751ea6375803acd5dec81
author: Martin Storsjö <[email protected]>
date: Sun Feb 9 18:39:11 EST 2020
arm32: looprestoration: Prepare for 16 bpc by splitting code to separate files

looprestoration_common.S contains functions that can be used as is, with a single instantiation of the functions serving both 8 and 16 bpc. This file will be built once, regardless of which bitdepths are enabled.

looprestoration_tmpl.S contains functions whose source can be shared and templated between 8 and 16 bpc. It will be included by the separate 8 and 16 bpc implementation files.
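(Illustration only, not part of this patch: once looprestoration_tmpl.S is parameterized over the bitdepth, a 16 bpc implementation file would be expected to pull in the shared code the same way the 8 bpc file does in the first hunk below. The file name and layout in this sketch are assumptions, not taken from this commit.)

    // looprestoration16.S (hypothetical sketch, assumed name)
    #include "src/arm/asm.S"
    #include "util.S"

    // 16 bpc-only NEON routines would go here...

    // Shared, templated routines instantiated for this bitdepth:
    #include "looprestoration_tmpl.S"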
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -676,6 +676,8 @@
#define SUM_STRIDE (384+16)
+#include "looprestoration_tmpl.S"
+
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
@@ -1236,863 +1238,4 @@
vpop {q4-q7}
pop {r4-r11,pc}
.purgem add5
-endfunc
-
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
- push {r4-r9,lr}
- ldr r4, [sp, #28]
- add r12, r3, #2 // Number of output rows to move back
- mov lr, r3 // Number of input rows to move back
- add r2, r2, #2 // Actual summed width
- mov r7, #(4*SUM_STRIDE) // sumsq stride
- mov r8, #(2*SUM_STRIDE) // sum stride
- sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
- sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
-
- tst r4, #4 // LR_HAVE_TOP
- beq 0f
- // If have top, read from row -2.
- sub r5, r0, #(4*SUM_STRIDE)
- sub r6, r1, #(2*SUM_STRIDE)
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add r5, r0, #(4*SUM_STRIDE)
- add r6, r1, #(2*SUM_STRIDE)
-1:
-
- tst r4, #8 // LR_HAVE_BOTTOM
- beq 1f
- // LR_HAVE_BOTTOM
- add r3, r3, #2 // Sum all h+2 lines with the main loop
- add lr, lr, #2
-1:
- mov r9, r3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into q8-q13 and q0-q2 taking top
- // padding into consideration.
- tst r4, #4 // LR_HAVE_TOP
- vld1.32 {q8, q9}, [r5, :128], r7
- vld1.16 {q0}, [r6, :128], r8
- beq 2f
- // LR_HAVE_TOP
- vld1.32 {q10, q11}, [r5, :128], r7
- vld1.16 {q1}, [r6, :128], r8
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- b 3f
-2: // !LR_HAVE_TOP
- vmov q10, q8
- vmov q11, q9
- vmov q1, q0
- vmov q12, q8
- vmov q13, q9
- vmov q2, q0
-
-3:
- subs r3, r3, #1
-.macro add3
- vadd.i32 q8, q8, q10
- vadd.i32 q9, q9, q11
- vadd.i16 q0, q0, q1
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i16 q0, q0, q2
- vst1.32 {q8, q9}, [r0, :128], r7
- vst1.16 {q0}, [r1, :128], r8
-.endm
- add3
- vmov q8, q10
- vmov q9, q11
- vmov q0, q1
- vmov q10, q12
- vmov q11, q13
- vmov q1, q2
- ble 4f
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- b 3b
-
-4:
- tst r4, #8 // LR_HAVE_BOTTOM
- bne 5f
- // !LR_HAVE_BOTTOM
- // Produce two more rows, extending the already loaded rows.
- add3
- vmov q8, q10
- vmov q9, q11
- vmov q0, q1
- add3
-
-5: // End of one vertical slice.
- subs r2, r2, #8
- ble 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- mls r5, r7, lr, r5
- mls r6, r8, lr, r6
- // Output pointers
- mls r0, r7, r12, r0
- mls r1, r8, r12, r1
- add r0, r0, #32
- add r1, r1, #16
- add r5, r5, #32
- add r6, r6, #16
- mov r3, r9
- b 1b
-
-0:
- pop {r4-r9,pc}
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
- push {r4-r9,lr}
- vpush {q5-q7}
- ldr r4, [sp, #76]
- add r12, r3, #2 // Number of output rows to move back
- mov lr, r3 // Number of input rows to move back
- add r2, r2, #8 // Actual summed width
- mov r7, #(4*SUM_STRIDE) // sumsq stride
- mov r8, #(2*SUM_STRIDE) // sum stride
- sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
- sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
-
- tst r4, #4 // LR_HAVE_TOP
- beq 0f
- // If have top, read from row -2.
- sub r5, r0, #(4*SUM_STRIDE)
- sub r6, r1, #(2*SUM_STRIDE)
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add r5, r0, #(4*SUM_STRIDE)
- add r6, r1, #(2*SUM_STRIDE)
-1:
-
- tst r4, #8 // LR_HAVE_BOTTOM
- beq 0f
- // LR_HAVE_BOTTOM
- add r3, r3, #2 // Handle h+2 lines with the main loop
- add lr, lr, #2
- b 1f
-0:
- // !LR_HAVE_BOTTOM
- sub r3, r3, #1 // Handle h-1 lines with the main loop
-1:
- mov r9, r3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into q6-q15 and q0-q3,q5 taking top
- // padding into consideration.
- tst r4, #4 // LR_HAVE_TOP
- vld1.32 {q6, q7}, [r5, :128], r7
- vld1.16 {q0}, [r6, :128], r8
- beq 2f
- // LR_HAVE_TOP
- vld1.32 {q10, q11}, [r5, :128], r7
- vld1.16 {q2}, [r6, :128], r8
- vmov q8, q6
- vmov q9, q7
- vmov q1, q0
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- b 3f
-2: // !LR_HAVE_TOP
- vmov q8, q6
- vmov q9, q7
- vmov q1, q0
- vmov q10, q6
- vmov q11, q7
- vmov q2, q0
- vmov q12, q6
- vmov q13, q7
- vmov q3, q0
-
-3:
- cmp r3, #0
- beq 4f
- vld1.32 {q14, q15}, [r5, :128], r7
- vld1.16 {q5}, [r6, :128], r8
-
-3:
- // Start of vertical loop
- subs r3, r3, #2
-.macro add5
- vadd.i32 q6, q6, q8
- vadd.i32 q7, q7, q9
- vadd.i16 q0, q0, q1
- vadd.i32 q6, q6, q10
- vadd.i32 q7, q7, q11
- vadd.i16 q0, q0, q2
- vadd.i32 q6, q6, q12
- vadd.i32 q7, q7, q13
- vadd.i16 q0, q0, q3
- vadd.i32 q6, q6, q14
- vadd.i32 q7, q7, q15
- vadd.i16 q0, q0, q5
- vst1.32 {q6, q7}, [r0, :128], r7
- vst1.16 {q0}, [r1, :128], r8
-.endm
- add5
-.macro shift2
- vmov q6, q10
- vmov q7, q11
- vmov q0, q2
- vmov q8, q12
- vmov q9, q13
- vmov q1, q3
- vmov q10, q14
- vmov q11, q15
- vmov q2, q5
-.endm
- shift2
- add r0, r0, r7
- add r1, r1, r8
- ble 5f
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- vld1.32 {q14, q15}, [r5, :128], r7
- vld1.16 {q5}, [r6, :128], r8
- b 3b
-
-4:
- // h == 1, !LR_HAVE_BOTTOM.
- // Pad the last row with the only content row, and add.
- vmov q14, q12
- vmov q15, q13
- vmov q5, q3
- add5
- shift2
- add r0, r0, r7
- add r1, r1, r8
- add5
- b 6f
-
-5:
- tst r4, #8 // LR_HAVE_BOTTOM
- bne 6f
- // !LR_HAVE_BOTTOM
- cmp r3, #0
- bne 5f
- // The intended three edge rows left; output the one at h-2 and
- // the past edge one at h.
- vld1.32 {q12, q13}, [r5, :128], r7
- vld1.16 {q3}, [r6, :128], r8
- // Pad the past-edge row from the last content row.
- vmov q14, q12
- vmov q15, q13
- vmov q5, q3
- add5
- shift2
- add r0, r0, r7
- add r1, r1, r8
- // The last two rows are already padded properly here.
- add5
- b 6f
-
-5:
- // r3 == -1, two rows left, output one.
- // Pad the last two rows from the mid one.
- vmov q12, q10
- vmov q13, q11
- vmov q3, q2
- vmov q14, q10
- vmov q15, q11
- vmov q5, q2
- add5
- add r0, r0, r7
- add r1, r1, r8
- b 6f
-
-6: // End of one vertical slice.
- subs r2, r2, #8
- ble 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- mls r5, r7, lr, r5
- mls r6, r8, lr, r6
- // Output pointers
- mls r0, r7, r12, r0
- mls r1, r8, r12, r1
- add r0, r0, #32
- add r1, r1, #16
- add r5, r5, #32
- add r6, r6, #16
- mov r3, r9
- b 1b
-
-0:
- vpop {q5-q7}
- pop {r4-r9,pc}
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
- push {r4-r5,lr}
- vpush {q4-q7}
- ldr r4, [sp, #76]
- add r3, r3, #2 // h += 2
- vmov.i32 q15, #9 // n
- movw r5, #455
- mov lr, #SUM_STRIDE
- b sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
- push {r4-r5,lr}
- vpush {q4-q7}
- ldr r4, [sp, #76]
- add r3, r3, #3 // h += 3
- asr r3, r3, #1 // h /= 2
- vmov.i32 q15, #25 // n
- mov r5, #164
- mov lr, #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
- movrel r12, X(sgr_x_by_x)
- vld1.8 {q8, q9}, [r12, :128]!
- vmov.i8 q11, #5
- vmov.i8 d10, #55 // idx of last 5
- vld1.8 {q10}, [r12, :128]
- vmov.i8 d11, #72 // idx of last 4
- vmov.i8 d12, #101 // idx of last 3
- vmov.i8 d13, #169 // idx of last 2
- vmov.i8 d14, #254 // idx of last 1
- vmov.i8 d15, #32 // elements consumed in first vtbl
- add r2, r2, #2 // w += 2
- add r12, r2, #7
- bic r12, r12, #7 // aligned w
- sub r12, lr, r12 // increment between rows
- vmov.i16 q13, #256
- vdup.32 q12, r4
- vdup.32 q14, r5 // one_by_x
- sub r0, r0, #(4*(SUM_STRIDE))
- sub r1, r1, #(2*(SUM_STRIDE))
- mov r4, r2 // backup of w
- vsub.i8 q8, q8, q11
- vsub.i8 q9, q9, q11
- vsub.i8 q10, q10, q11
-1:
- subs r2, r2, #8
- vld1.32 {q0, q1}, [r0, :128] // a
- vld1.16 {q2}, [r1, :128] // b
- vmul.i32 q0, q0, q15 // a * n
- vmul.i32 q1, q1, q15 // a * n
- vmull.u16 q3, d4, d4 // b * b
- vmull.u16 q4, d5, d5 // b * b
- vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
- vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
- vmul.i32 q0, q0, q12 // p * s
- vmul.i32 q1, q1, q12 // p * s
- vqshrn.u32 d0, q0, #16
- vqshrn.u32 d1, q1, #16
- vqrshrn.u16 d0, q0, #4 // imin(z, 255)
-
- vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
- vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
- vtbl.8 d1, {q8, q9}, d0
- vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
- vsub.i8 d9, d0, d15 // indices for vtbx
- vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
- vadd.i8 d2, d2, d3
- vtbx.8 d1, {q10}, d9
- vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
- vadd.i8 d6, d6, d7
- vadd.i8 d8, d8, d22
- vadd.i8 d2, d2, d6
- vadd.i8 d1, d1, d8
- vadd.i8 d1, d1, d2
- vmovl.u8 q0, d1 // x
-
- vmull.u16 q1, d0, d4 // x * BB[i]
- vmull.u16 q2, d1, d5 // x * BB[i]
- vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
- vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
- vrshr.s32 q1, q1, #12 // AA[i]
- vrshr.s32 q2, q2, #12 // AA[i]
- vsub.i16 q0, q13, q0 // 256 - x
-
- vst1.32 {q1, q2}, [r0, :128]!
- vst1.16 {q0}, [r1, :128]!
- bgt 1b
-
- subs r3, r3, #1
- ble 0f
- add r0, r0, r12, lsl #2
- add r1, r1, r12, lsl #1
- mov r2, r4
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r5,pc}
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
- push {r4-r11,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #100]
- ldr r6, [sp, #108]
- sub r7, r3, #(4*SUM_STRIDE)
- add r8, r3, #(4*SUM_STRIDE)
- sub r9, r4, #(2*SUM_STRIDE)
- add r10, r4, #(2*SUM_STRIDE)
- mov r11, #SUM_STRIDE
- mov r12, #FILTER_OUT_STRIDE
- add lr, r5, #3
- bic lr, lr, #3 // Aligned width
- sub r2, r2, lr
- sub r12, r12, lr
- sub r11, r11, lr
- sub r11, r11, #4 // We read 4 extra elements from both a and b
- mov lr, r5
- vmov.i16 q14, #3
- vmov.i32 q15, #3
-1:
- vld1.16 {q0}, [r9, :128]!
- vld1.16 {q1}, [r4, :128]!
- vld1.16 {q2}, [r10, :128]!
- vld1.32 {q8, q9}, [r7, :128]!
- vld1.32 {q10, q11}, [r3, :128]!
- vld1.32 {q12, q13}, [r8, :128]!
-
-2:
- subs r5, r5, #4
- vext.8 d6, d0, d1, #2 // -stride
- vext.8 d7, d2, d3, #2 // 0
- vext.8 d8, d4, d5, #2 // +stride
- vext.8 d9, d0, d1, #4 // +1-stride
- vext.8 d10, d2, d3, #4 // +1
- vext.8 d11, d4, d5, #4 // +1+stride
- vadd.i16 d2, d2, d6 // -1, -stride
- vadd.i16 d7, d7, d8 // 0, +stride
- vadd.i16 d0, d0, d9 // -1-stride, +1-stride
- vadd.i16 d2, d2, d7
- vadd.i16 d4, d4, d11 // -1+stride, +1+stride
- vadd.i16 d2, d2, d10 // +1
- vadd.i16 d0, d0, d4
-
- vext.8 q3, q8, q9, #4 // -stride
- vshl.i16 d2, d2, #2
- vext.8 q4, q8, q9, #8 // +1-stride
- vext.8 q5, q10, q11, #4 // 0
- vext.8 q6, q10, q11, #8 // +1
- vmla.i16 d2, d0, d28 // * 3 -> a
- vadd.i32 q3, q3, q10 // -stride, -1
- vadd.i32 q8, q8, q4 // -1-stride, +1-stride
- vadd.i32 q5, q5, q6 // 0, +1
- vadd.i32 q8, q8, q12 // -1+stride
- vadd.i32 q3, q3, q5
- vext.8 q7, q12, q13, #4 // +stride
- vext.8 q10, q12, q13, #8 // +1+stride
- vld1.32 {d24[0]}, [r1, :32]! // src
- vadd.i32 q3, q3, q7 // +stride
- vadd.i32 q8, q8, q10 // +1+stride
- vshl.i32 q3, q3, #2
- vmla.i32 q3, q8, q15 // * 3 -> b
- vmovl.u8 q12, d24 // src
- vmov d0, d1
- vmlal.u16 q3, d2, d24 // b + a * src
- vmov d2, d3
- vrshrn.i32 d6, q3, #9
- vmov d4, d5
- vst1.16 {d6}, [r0]!
-
- ble 3f
- vmov q8, q9
- vmov q10, q11
- vmov q12, q13
- vld1.16 {d1}, [r9, :64]!
- vld1.16 {d3}, [r4, :64]!
- vld1.16 {d5}, [r10, :64]!
- vld1.32 {q9}, [r7, :128]!
- vld1.32 {q11}, [r3, :128]!
- vld1.32 {q13}, [r8, :128]!
- b 2b
-
-3:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- add r0, r0, r12, lsl #1
- add r1, r1, r2
- add r3, r3, r11, lsl #2
- add r7, r7, r11, lsl #2
- add r8, r8, r11, lsl #2
- add r4, r4, r11, lsl #1
- add r9, r9, r11, lsl #1
- add r10, r10, r11, lsl #1
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
- push {r4-r11,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #100]
- ldr r6, [sp, #108]
- add r7, r3, #(4*(SUM_STRIDE))
- sub r3, r3, #(4*(SUM_STRIDE))
- add r8, r4, #(2*(SUM_STRIDE))
- sub r4, r4, #(2*(SUM_STRIDE))
- mov r9, #(2*SUM_STRIDE)
- mov r10, #FILTER_OUT_STRIDE
- add r11, r5, #7
- bic r11, r11, #7 // Aligned width
- sub r2, r2, r11
- sub r10, r10, r11
- sub r9, r9, r11
- sub r9, r9, #4 // We read 4 extra elements from a
- sub r12, r9, #4 // We read 8 extra elements from b
- mov lr, r5
-
-1:
- vld1.16 {q0, q1}, [r4, :128]!
- vld1.16 {q2, q3}, [r8, :128]!
- vld1.32 {q8, q9}, [r3, :128]!
- vld1.32 {q11, q12}, [r7, :128]!
- vld1.32 {q10}, [r3, :128]!
- vld1.32 {q13}, [r7, :128]!
-
-2:
- vmov.i16 q14, #5
- vmov.i16 q15, #6
- subs r5, r5, #8
- vext.8 q4, q0, q1, #4 // +1-stride
- vext.8 q5, q2, q3, #4 // +1+stride
- vext.8 q6, q0, q1, #2 // -stride
- vext.8 q7, q2, q3, #2 // +stride
- vadd.i16 q0, q0, q4 // -1-stride, +1-stride
- vadd.i16 q5, q2, q5 // -1+stride, +1+stride
- vadd.i16 q2, q6, q7 // -stride, +stride
- vadd.i16 q0, q0, q5
-
- vext.8 q4, q8, q9, #8 // +1-stride
- vext.8 q5, q9, q10, #8
- vext.8 q6, q11, q12, #8 // +1+stride
- vext.8 q7, q12, q13, #8
- vmul.i16 q0, q0, q14 // * 5
- vmla.i16 q0, q2, q15 // * 6
- vadd.i32 q4, q4, q8 // -1-stride, +1-stride
- vadd.i32 q5, q5, q9
- vadd.i32 q6, q6, q11 // -1+stride, +1+stride
- vadd.i32 q7, q7, q12
- vadd.i32 q4, q4, q6
- vadd.i32 q5, q5, q7
- vext.8 q6, q8, q9, #4 // -stride
- vext.8 q7, q9, q10, #4
- vext.8 q8, q11, q12, #4 // +stride
- vext.8 q11, q12, q13, #4
-
- vld1.8 {d4}, [r1, :64]!
-
- vmov.i32 q14, #5
- vmov.i32 q15, #6
-
- vadd.i32 q6, q6, q8 // -stride, +stride
- vadd.i32 q7, q7, q11
- vmul.i32 q4, q4, q14 // * 5
- vmla.i32 q4, q6, q15 // * 6
- vmul.i32 q5, q5, q14 // * 5
- vmla.i32 q5, q7, q15 // * 6
-
- vmovl.u8 q2, d4
- vmlal.u16 q4, d0, d4 // b + a * src
- vmlal.u16 q5, d1, d5 // b + a * src
- vmov q0, q1
- vrshrn.i32 d8, q4, #9
- vrshrn.i32 d9, q5, #9
- vmov q2, q3
- vst1.16 {q4}, [r0, :128]!
-
- ble 3f
- vmov q8, q10
- vmov q11, q13
- vld1.16 {q1}, [r4, :128]!
- vld1.16 {q3}, [r8, :128]!
- vld1.32 {q9, q10}, [r3, :128]!
- vld1.32 {q12, q13}, [r7, :128]!
- b 2b
-
-3:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- add r0, r0, r10, lsl #1
- add r1, r1, r2
- add r3, r3, r9, lsl #2
- add r7, r7, r9, lsl #2
- add r4, r4, r12, lsl #1
- add r8, r8, r12, lsl #1
-
- vld1.32 {q8, q9}, [r3, :128]!
- vld1.16 {q0, q1}, [r4, :128]!
- vld1.32 {q10}, [r3, :128]!
-
- vmov.i16 q12, #5
- vmov.i16 q13, #6
-
-4:
- subs r5, r5, #8
- vext.8 q3, q0, q1, #4 // +1
- vext.8 q2, q0, q1, #2 // 0
- vadd.i16 q0, q0, q3 // -1, +1
-
- vext.8 q4, q8, q9, #4 // 0
- vext.8 q5, q9, q10, #4
- vext.8 q6, q8, q9, #8 // +1
- vext.8 q7, q9, q10, #8
- vmul.i16 q2, q2, q13 // * 6
- vmla.i16 q2, q0, q12 // * 5 -> a
- vld1.8 {d22}, [r1, :64]!
- vadd.i32 q8, q8, q6 // -1, +1
- vadd.i32 q9, q9, q7
- vmovl.u8 q11, d22
- vmul.i32 q4, q4, q15 // * 6
- vmla.i32 q4, q8, q14 // * 5 -> b
- vmul.i32 q5, q5, q15 // * 6
- vmla.i32 q5, q9, q14 // * 5 -> b
-
- vmlal.u16 q4, d4, d22 // b + a * src
- vmlal.u16 q5, d5, d23
- vmov q0, q1
- vrshrn.i32 d8, q4, #8
- vrshrn.i32 d9, q5, #8
- vmov q8, q10
- vst1.16 {q4}, [r0, :128]!
-
- ble 5f
- vld1.16 {q1}, [r4, :128]!
- vld1.32 {q9, q10}, [r3, :128]!
- b 4b
-
-5:
- subs r6, r6, #1
- ble 0f
- mov r5, lr
- sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
- sub r4, r4, r11, lsl #1
- add r0, r0, r10, lsl #1
- add r1, r1, r2
- sub r3, r3, #16
- sub r4, r4, #16
- b 1b
-0:
- vpop {q4-q7}
- pop {r4-r11,pc}
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int w, const int h,
-// const int wt);
-function sgr_weighted1_8bpc_neon, export=1
- push {r4-r9,lr}
- ldrd r4, r5, [sp, #28]
- ldrd r6, r7, [sp, #36]
- vdup.16 d31, r7
- cmp r6, #2
- add r9, r0, r1
- add r12, r2, r3
- add lr, r4, #2*FILTER_OUT_STRIDE
- mov r7, #(4*FILTER_OUT_STRIDE)
- lsl r1, r1, #1
- lsl r3, r3, #1
- add r8, r5, #7
- bic r8, r8, #7 // Aligned width
- sub r1, r1, r8
- sub r3, r3, r8
- sub r7, r7, r8, lsl #1
- mov r8, r5
- blt 2f
-1:
- vld1.8 {d0}, [r2, :64]!
- vld1.8 {d16}, [r12, :64]!
- vld1.16 {q1}, [r4, :128]!
- vld1.16 {q9}, [lr, :128]!
- subs r5, r5, #8
- vshll.u8 q0, d0, #4 // u
- vshll.u8 q8, d16, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q9, q9, q8 // t1 - u
- vshll.u16 q2, d0, #7 // u << 7
- vshll.u16 q3, d1, #7 // u << 7
- vshll.u16 q10, d16, #7 // u << 7
- vshll.u16 q11, d17, #7 // u << 7
- vmlal.s16 q2, d2, d31 // v
- vmlal.s16 q3, d3, d31 // v
- vmlal.s16 q10, d18, d31 // v
- vmlal.s16 q11, d19, d31 // v
- vrshrn.i32 d4, q2, #11
- vrshrn.i32 d5, q3, #11
- vrshrn.i32 d20, q10, #11
- vrshrn.i32 d21, q11, #11
- vqmovun.s16 d4, q2
- vqmovun.s16 d20, q10
- vst1.8 {d4}, [r0]!
- vst1.8 {d20}, [r9]!
- bgt 1b
-
- sub r6, r6, #2
- cmp r6, #1
- blt 0f
- mov r5, r8
- add r0, r0, r1
- add r9, r9, r1
- add r2, r2, r3
- add r12, r12, r3
- add r4, r4, r7
- add lr, lr, r7
- beq 2f
- b 1b
-
-2:
- vld1.8 {d0}, [r2, :64]!
- vld1.16 {q1}, [r4, :128]!
- subs r5, r5, #8
- vshll.u8 q0, d0, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vshll.u16 q2, d0, #7 // u << 7
- vshll.u16 q3, d1, #7 // u << 7
- vmlal.s16 q2, d2, d31 // v
- vmlal.s16 q3, d3, d31 // v
- vrshrn.i32 d4, q2, #11
- vrshrn.i32 d5, q3, #11
- vqmovun.s16 d2, q2
- vst1.8 {d2}, [r0]!
- bgt 2b
-0:
- pop {r4-r9,pc}
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int16_t *t2,
-// const int w, const int h,
-// const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
- push {r4-r11,lr}
- ldrd r4, r5, [sp, #36]
- ldrd r6, r7, [sp, #44]
- ldr r8, [sp, #52]
- cmp r7, #2
- add r10, r0, r1
- add r11, r2, r3
- add r12, r4, #2*FILTER_OUT_STRIDE
- add lr, r5, #2*FILTER_OUT_STRIDE
- vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
- mov r8, #4*FILTER_OUT_STRIDE
- lsl r1, r1, #1
- lsl r3, r3, #1
- add r9, r6, #7
- bic r9, r9, #7 // Aligned width
- sub r1, r1, r9
- sub r3, r3, r9
- sub r8, r8, r9, lsl #1
- mov r9, r6
- blt 2f
-1:
- vld1.8 {d0}, [r2, :64]!
- vld1.8 {d16}, [r11, :64]!
- vld1.16 {q1}, [r4, :128]!
- vld1.16 {q9}, [r12, :128]!
- vld1.16 {q2}, [r5, :128]!
- vld1.16 {q10}, [lr, :128]!
- subs r6, r6, #8
- vshll.u8 q0, d0, #4 // u
- vshll.u8 q8, d16, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q2, q2, q0 // t2 - u
- vsub.i16 q9, q9, q8 // t1 - u
- vsub.i16 q10, q10, q8 // t2 - u
- vshll.u16 q3, d0, #7 // u << 7
- vshll.u16 q0, d1, #7 // u << 7
- vshll.u16 q11, d16, #7 // u << 7
- vshll.u16 q8, d17, #7 // u << 7
- vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
- vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
- vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
- vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
- vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
- vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
- vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
- vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
- vrshrn.i32 d6, q3, #11
- vrshrn.i32 d7, q0, #11
- vrshrn.i32 d22, q11, #11
- vrshrn.i32 d23, q8, #11
- vqmovun.s16 d6, q3
- vqmovun.s16 d22, q11
- vst1.8 {d6}, [r0]!
- vst1.8 {d22}, [r10]!
- bgt 1b
-
- subs r7, r7, #2
- cmp r7, #1
- blt 0f
- mov r6, r9
- add r0, r0, r1
- add r10, r10, r1
- add r2, r2, r3
- add r11, r11, r3
- add r4, r4, r8
- add r12, r12, r8
- add r5, r5, r8
- add lr, lr, r8
- beq 2f
- b 1b
-
-2:
- vld1.8 {d0}, [r2, :64]!
- vld1.16 {q1}, [r4, :128]!
- vld1.16 {q2}, [r5, :128]!
- subs r6, r6, #8
- vshll.u8 q0, d0, #4 // u
- vsub.i16 q1, q1, q0 // t1 - u
- vsub.i16 q2, q2, q0 // t2 - u
- vshll.u16 q3, d0, #7 // u << 7
- vshll.u16 q0, d1, #7 // u << 7
- vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
- vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
- vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
- vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
- vrshrn.i32 d6, q3, #11
- vrshrn.i32 d7, q0, #11
- vqmovun.s16 d6, q3
- vst1.8 {d6}, [r0]!
- bgt 1b
-0:
- pop {r4-r11,pc}
endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_common.S
@@ -1,0 +1,441 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #2 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Sum all h+2 lines with the main loop
+ add lr, lr, #2
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q8-q13 and q0-q2 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q8, q9}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q1}, [r6, :128], r8
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q10, q8
+ vmov q11, q9
+ vmov q1, q0
+ vmov q12, q8
+ vmov q13, q9
+ vmov q2, q0
+
+3:
+ subs r3, r3, #1
+.macro add3
+ vadd.i32 q8, q8, q10
+ vadd.i32 q9, q9, q11
+ vadd.i16 q0, q0, q1
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i16 q0, q0, q2
+ vst1.32 {q8, q9}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ vmov q10, q12
+ vmov q11, q13
+ vmov q1, q2
+ ble 4f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3b
+
+4:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ add3
+
+5: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ pop {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+ push {r4-r9,lr}
+ vpush {q5-q7}
+ ldr r4, [sp, #76]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #8 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 0f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Handle h+2 lines with the main loop
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub r3, r3, #1 // Handle h-1 lines with the main loop
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q6-q15 and q0-q3,q5 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q6, q7}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vmov q10, q6
+ vmov q11, q7
+ vmov q2, q0
+ vmov q12, q6
+ vmov q13, q7
+ vmov q3, q0
+
+3:
+ cmp r3, #0
+ beq 4f
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+
+3:
+ // Start of vertical loop
+ subs r3, r3, #2
+.macro add5
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+ vadd.i16 q0, q0, q1
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i16 q0, q0, q2
+ vadd.i32 q6, q6, q12
+ vadd.i32 q7, q7, q13
+ vadd.i16 q0, q0, q3
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q15
+ vadd.i16 q0, q0, q5
+ vst1.32 {q6, q7}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add5
+.macro shift2
+ vmov q6, q10
+ vmov q7, q11
+ vmov q0, q2
+ vmov q8, q12
+ vmov q9, q13
+ vmov q1, q3
+ vmov q10, q14
+ vmov q11, q15
+ vmov q2, q5
+.endm
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ ble 5f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ add5
+ b 6f
+
+5:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 6f
+ // !LR_HAVE_BOTTOM
+ cmp r3, #0
+ bne 5f
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ // Pad the past-edge row from the last content row.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // r3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ vmov q12, q10
+ vmov q13, q11
+ vmov q3, q2
+ vmov q14, q10
+ vmov q15, q11
+ vmov q5, q2
+ add5
+ add r0, r0, r7
+ add r1, r1, r8
+ b 6f
+
+6: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ vpop {q5-q7}
+ pop {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #76]
+ add r3, r3, #2 // h += 2
+ vmov.i32 q15, #9 // n
+ movw r5, #455
+ mov lr, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #76]
+ add r3, r3, #3 // h += 3
+ asr r3, r3, #1 // h /= 2
+ vmov.i32 q15, #25 // n
+ mov r5, #164
+ mov lr, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel r12, X(sgr_x_by_x)
+ vld1.8 {q8, q9}, [r12, :128]!
+ vmov.i8 q11, #5
+ vmov.i8 d10, #55 // idx of last 5
+ vld1.8 {q10}, [r12, :128]
+ vmov.i8 d11, #72 // idx of last 4
+ vmov.i8 d12, #101 // idx of last 3
+ vmov.i8 d13, #169 // idx of last 2
+ vmov.i8 d14, #254 // idx of last 1
+ vmov.i8 d15, #32 // elements consumed in first vtbl
+ add r2, r2, #2 // w += 2
+ add r12, r2, #7
+ bic r12, r12, #7 // aligned w
+ sub r12, lr, r12 // increment between rows
+ vmov.i16 q13, #256
+ vdup.32 q12, r4
+ vdup.32 q14, r5 // one_by_x
+ sub r0, r0, #(4*(SUM_STRIDE))
+ sub r1, r1, #(2*(SUM_STRIDE))
+ mov r4, r2 // backup of w
+ vsub.i8 q8, q8, q11
+ vsub.i8 q9, q9, q11
+ vsub.i8 q10, q10, q11
+1:
+ subs r2, r2, #8
+ vld1.32 {q0, q1}, [r0, :128] // a
+ vld1.16 {q2}, [r1, :128] // b
+ vmul.i32 q0, q0, q15 // a * n
+ vmul.i32 q1, q1, q15 // a * n
+ vmull.u16 q3, d4, d4 // b * b
+ vmull.u16 q4, d5, d5 // b * b
+ vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
+ vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
+ vmul.i32 q0, q0, q12 // p * s
+ vmul.i32 q1, q1, q12 // p * s
+ vqshrn.u32 d0, q0, #16
+ vqshrn.u32 d1, q1, #16
+ vqrshrn.u16 d0, q0, #4 // imin(z, 255)
+
+ vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
+ vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
+ vtbl.8 d1, {q8, q9}, d0
+ vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
+ vsub.i8 d9, d0, d15 // indices for vtbx
+ vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
+ vadd.i8 d2, d2, d3
+ vtbx.8 d1, {q10}, d9
+ vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
+ vadd.i8 d6, d6, d7
+ vadd.i8 d8, d8, d22
+ vadd.i8 d2, d2, d6
+ vadd.i8 d1, d1, d8
+ vadd.i8 d1, d1, d2
+ vmovl.u8 q0, d1 // x
+
+ vmull.u16 q1, d0, d4 // x * BB[i]
+ vmull.u16 q2, d1, d5 // x * BB[i]
+ vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
+ vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
+ vrshr.s32 q1, q1, #12 // AA[i]
+ vrshr.s32 q2, q2, #12 // AA[i]
+ vsub.i16 q0, q13, q0 // 256 - x
+
+ vst1.32 {q1, q2}, [r0, :128]!
+ vst1.16 {q0}, [r1, :128]!
+ bgt 1b
+
+ subs r3, r3, #1
+ ble 0f
+ add r0, r0, r12, lsl #2
+ add r1, r1, r12, lsl #1
+ mov r2, r4
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r5,pc}
+endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_tmpl.S
@@ -1,0 +1,477 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ sub r7, r3, #(4*SUM_STRIDE)
+ add r8, r3, #(4*SUM_STRIDE)
+ sub r9, r4, #(2*SUM_STRIDE)
+ add r10, r4, #(2*SUM_STRIDE)
+ mov r11, #SUM_STRIDE
+ mov r12, #FILTER_OUT_STRIDE
+ add lr, r5, #3
+ bic lr, lr, #3 // Aligned width
+ sub r2, r2, lr
+ sub r12, r12, lr
+ sub r11, r11, lr
+ sub r11, r11, #4 // We read 4 extra elements from both a and b
+ mov lr, r5
+ vmov.i16 q14, #3
+ vmov.i32 q15, #3
+1:
+ vld1.16 {q0}, [r9, :128]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r10, :128]!
+ vld1.32 {q8, q9}, [r7, :128]!
+ vld1.32 {q10, q11}, [r3, :128]!
+ vld1.32 {q12, q13}, [r8, :128]!
+
+2:
+ subs r5, r5, #4
+ vext.8 d6, d0, d1, #2 // -stride
+ vext.8 d7, d2, d3, #2 // 0
+ vext.8 d8, d4, d5, #2 // +stride
+ vext.8 d9, d0, d1, #4 // +1-stride
+ vext.8 d10, d2, d3, #4 // +1
+ vext.8 d11, d4, d5, #4 // +1+stride
+ vadd.i16 d2, d2, d6 // -1, -stride
+ vadd.i16 d7, d7, d8 // 0, +stride
+ vadd.i16 d0, d0, d9 // -1-stride, +1-stride
+ vadd.i16 d2, d2, d7
+ vadd.i16 d4, d4, d11 // -1+stride, +1+stride
+ vadd.i16 d2, d2, d10 // +1
+ vadd.i16 d0, d0, d4
+
+ vext.8 q3, q8, q9, #4 // -stride
+ vshl.i16 d2, d2, #2
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q10, q11, #4 // 0
+ vext.8 q6, q10, q11, #8 // +1
+ vmla.i16 d2, d0, d28 // * 3 -> a
+ vadd.i32 q3, q3, q10 // -stride, -1
+ vadd.i32 q8, q8, q4 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q6 // 0, +1
+ vadd.i32 q8, q8, q12 // -1+stride
+ vadd.i32 q3, q3, q5
+ vext.8 q7, q12, q13, #4 // +stride
+ vext.8 q10, q12, q13, #8 // +1+stride
+ vld1.32 {d24[0]}, [r1, :32]! // src
+ vadd.i32 q3, q3, q7 // +stride
+ vadd.i32 q8, q8, q10 // +1+stride
+ vshl.i32 q3, q3, #2
+ vmla.i32 q3, q8, q15 // * 3 -> b
+ vmovl.u8 q12, d24 // src
+ vmov d0, d1
+ vmlal.u16 q3, d2, d24 // b + a * src
+ vmov d2, d3
+ vrshrn.i32 d6, q3, #9
+ vmov d4, d5
+ vst1.16 {d6}, [r0]!
+
+ ble 3f
+ vmov q8, q9
+ vmov q10, q11
+ vmov q12, q13
+ vld1.16 {d1}, [r9, :64]!
+ vld1.16 {d3}, [r4, :64]!
+ vld1.16 {d5}, [r10, :64]!
+ vld1.32 {q9}, [r7, :128]!
+ vld1.32 {q11}, [r3, :128]!
+ vld1.32 {q13}, [r8, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r12, lsl #1
+ add r1, r1, r2
+ add r3, r3, r11, lsl #2
+ add r7, r7, r11, lsl #2
+ add r8, r8, r11, lsl #2
+ add r4, r4, r11, lsl #1
+ add r9, r9, r11, lsl #1
+ add r10, r10, r11, lsl #1
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter2_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ add r7, r3, #(4*(SUM_STRIDE))
+ sub r3, r3, #(4*(SUM_STRIDE))
+ add r8, r4, #(2*(SUM_STRIDE))
+ sub r4, r4, #(2*(SUM_STRIDE))
+ mov r9, #(2*SUM_STRIDE)
+ mov r10, #FILTER_OUT_STRIDE
+ add r11, r5, #7
+ bic r11, r11, #7 // Aligned width
+ sub r2, r2, r11
+ sub r10, r10, r11
+ sub r9, r9, r11
+ sub r9, r9, #4 // We read 4 extra elements from a
+ sub r12, r9, #4 // We read 8 extra elements from b
+ mov lr, r5
+
+1:
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.16 {q2, q3}, [r8, :128]!
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.32 {q11, q12}, [r7, :128]!
+ vld1.32 {q10}, [r3, :128]!
+ vld1.32 {q13}, [r7, :128]!
+
+2:
+ vmov.i16 q14, #5
+ vmov.i16 q15, #6
+ subs r5, r5, #8
+ vext.8 q4, q0, q1, #4 // +1-stride
+ vext.8 q5, q2, q3, #4 // +1+stride
+ vext.8 q6, q0, q1, #2 // -stride
+ vext.8 q7, q2, q3, #2 // +stride
+ vadd.i16 q0, q0, q4 // -1-stride, +1-stride
+ vadd.i16 q5, q2, q5 // -1+stride, +1+stride
+ vadd.i16 q2, q6, q7 // -stride, +stride
+ vadd.i16 q0, q0, q5
+
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q9, q10, #8
+ vext.8 q6, q11, q12, #8 // +1+stride
+ vext.8 q7, q12, q13, #8
+ vmul.i16 q0, q0, q14 // * 5
+ vmla.i16 q0, q2, q15 // * 6
+ vadd.i32 q4, q4, q8 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q9
+ vadd.i32 q6, q6, q11 // -1+stride, +1+stride
+ vadd.i32 q7, q7, q12
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q7
+ vext.8 q6, q8, q9, #4 // -stride
+ vext.8 q7, q9, q10, #4
+ vext.8 q8, q11, q12, #4 // +stride
+ vext.8 q11, q12, q13, #4
+
+ vld1.8 {d4}, [r1, :64]!
+
+ vmov.i32 q14, #5
+ vmov.i32 q15, #6
+
+ vadd.i32 q6, q6, q8 // -stride, +stride
+ vadd.i32 q7, q7, q11
+ vmul.i32 q4, q4, q14 // * 5
+ vmla.i32 q4, q6, q15 // * 6
+ vmul.i32 q5, q5, q14 // * 5
+ vmla.i32 q5, q7, q15 // * 6
+
+ vmovl.u8 q2, d4
+ vmlal.u16 q4, d0, d4 // b + a * src
+ vmlal.u16 q5, d1, d5 // b + a * src
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #9
+ vrshrn.i32 d9, q5, #9
+ vmov q2, q3
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q3}, [r8, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ vld1.32 {q12, q13}, [r7, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ add r3, r3, r9, lsl #2
+ add r7, r7, r9, lsl #2
+ add r4, r4, r12, lsl #1
+ add r8, r8, r12, lsl #1
+
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.32 {q10}, [r3, :128]!
+
+ vmov.i16 q12, #5
+ vmov.i16 q13, #6
+
+4:
+ subs r5, r5, #8
+ vext.8 q3, q0, q1, #4 // +1
+ vext.8 q2, q0, q1, #2 // 0
+ vadd.i16 q0, q0, q3 // -1, +1
+
+ vext.8 q4, q8, q9, #4 // 0
+ vext.8 q5, q9, q10, #4
+ vext.8 q6, q8, q9, #8 // +1
+ vext.8 q7, q9, q10, #8
+ vmul.i16 q2, q2, q13 // * 6
+ vmla.i16 q2, q0, q12 // * 5 -> a
+ vld1.8 {d22}, [r1, :64]!
+ vadd.i32 q8, q8, q6 // -1, +1
+ vadd.i32 q9, q9, q7
+ vmovl.u8 q11, d22
+ vmul.i32 q4, q4, q15 // * 6
+ vmla.i32 q4, q8, q14 // * 5 -> b
+ vmul.i32 q5, q5, q15 // * 6
+ vmla.i32 q5, q9, q14 // * 5 -> b
+
+ vmlal.u16 q4, d4, d22 // b + a * src
+ vmlal.u16 q5, d5, d23
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+ vmov q8, q10
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 5f
+ vld1.16 {q1}, [r4, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ b 4b
+
+5:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
+ sub r4, r4, r11, lsl #1
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ sub r3, r3, #16
+ sub r4, r4, #16
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt);
+function sgr_weighted1_8bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+ vdup.16 d31, r7
+ cmp r6, #2
+ add r9, r0, r1
+ add r12, r2, r3
+ add lr, r4, #2*FILTER_OUT_STRIDE
+ mov r7, #(4*FILTER_OUT_STRIDE)
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r8, r5, #7
+ bic r8, r8, #7 // Aligned width
+ sub r1, r1, r8
+ sub r3, r3, r8
+ sub r7, r7, r8, lsl #1
+ mov r8, r5
+ blt 2f
+1:
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r12, :64]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [lr, :128]!
+ subs r5, r5, #8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vshll.u16 q10, d16, #7 // u << 7
+ vshll.u16 q11, d17, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vmlal.s16 q10, d18, d31 // v
+ vmlal.s16 q11, d19, d31 // v
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vrshrn.i32 d20, q10, #11
+ vrshrn.i32 d21, q11, #11
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d20, q10
+ vst1.8 {d4}, [r0]!
+ vst1.8 {d20}, [r9]!
+ bgt 1b
+
+ sub r6, r6, #2
+ cmp r6, #1
+ blt 0f
+ mov r5, r8
+ add r0, r0, r1
+ add r9, r9, r1
+ add r2, r2, r3
+ add r12, r12, r3
+ add r4, r4, r7
+ add lr, lr, r7
+ beq 2f
+ b 1b
+
+2:
+ vld1.8 {d0}, [r2, :64]!
+ vld1.16 {q1}, [r4, :128]!
+ subs r5, r5, #8
+ vshll.u8 q0, d0, #4 // u
+ vsub.i16 q1, q1, q0 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vqmovun.s16 d2, q2
+ vst1.8 {d2}, [r0]!
+ bgt 2b
+0:
+ pop {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2]);
+function sgr_weighted2_8bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ ldr r8, [sp, #52]
+ cmp r7, #2
+ add r10, r0, r1
+ add r11, r2, r3
+ add r12, r4, #2*FILTER_OUT_STRIDE
+ add lr, r5, #2*FILTER_OUT_STRIDE
+ vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
+ mov r8, #4*FILTER_OUT_STRIDE
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r9, r6, #7
+ bic r9, r9, #7 // Aligned width
+ sub r1, r1, r9
+ sub r3, r3, r9
+ sub r8, r8, r9, lsl #1
+ mov r9, r6
+ blt 2f
+1:
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r11, :64]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [r12, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ vld1.16 {q10}, [lr, :128]!
+ subs r6, r6, #8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vsub.i16 q10, q10, q8 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vshll.u16 q11, d16, #7 // u << 7
+ vshll.u16 q8, d17, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vrshrn.i32 d22, q11, #11
+ vrshrn.i32 d23, q8, #11
+ vqmovun.s16 d6, q3
+ vqmovun.s16 d22, q11
+ vst1.8 {d6}, [r0]!
+ vst1.8 {d22}, [r10]!
+ bgt 1b
+
+ subs r7, r7, #2
+ cmp r7, #1
+ blt 0f
+ mov r6, r9
+ add r0, r0, r1
+ add r10, r10, r1
+ add r2, r2, r3
+ add r11, r11, r3
+ add r4, r4, r8
+ add r12, r12, r8
+ add r5, r5, r8
+ add lr, lr, r8
+ beq 2f
+ b 1b
+
+2:
+ vld1.8 {d0}, [r2, :64]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ subs r6, r6, #8
+ vshll.u8 q0, d0, #4 // u
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vqmovun.s16 d6, q3
+ vst1.8 {d6}, [r0]!
+ bgt 1b
+0:
+ pop {r4-r11,pc}
+endfunc
--- a/src/meson.build
+++ b/src/meson.build
@@ -132,6 +132,7 @@
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources_asm = files(
+ 'arm/32/looprestoration_common.S',
'arm/32/msac.S',
)