ref: 3811665793d30d62523855f87112d1e267996d53
dir: /src/arm/32/looprestoration.S/
/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" // void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[7], const intptr_t w, // int h, enum LrEdgeFlags edges); function wiener_filter_h_neon, export=1 push {r4-r11,lr} vpush {q4} ldrd r4, r5, [sp, #52] ldrd r6, r7, [sp, #60] mov r8, r5 vld1.16 {q0}, [r4] movw r9, #(1 << 14) - (1 << 2) vdup.16 q14, r9 vmov.s16 q15, #2048 // Calculate mid_stride add r10, r5, #7 bic r10, r10, #7 lsl r10, r10, #1 // Clear the last unused element of q0, to allow filtering a single // pixel with one plain vmul+vpadd. mov r12, #0 vmov.16 d1[3], r12 // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 // Subtract the width from mid_stride sub r10, r10, r5, lsl #1 // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. cmp r5, #8 add r11, r5, #13 bic r11, r11, #7 bge 1f mov r11, #16 1: sub r3, r3, r11 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r1, #0 bne 0f // left == NULL sub r2, r2, #3 sub lr, lr, #3 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add r3, r3, #3 1: // Loop vertically vld1.8 {q2}, [r2]! vld1.8 {q9}, [lr]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r1, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.32 {d3[1]}, [r1]! // Move r2/lr back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub r2, r2, #3 sub lr, lr, #3 vld1.32 {d17[1]}, [r1]! vext.8 q2, q1, q2, #13 vext.8 q9, q8, q9, #13 b 2f 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q2 to have 3x the first byte at the front. vdup.8 q1, d4[0] vdup.8 q8, d18[0] // Move r2 back to account for the last 3 bytes we loaded before, // which we shifted out. sub r2, r2, #3 sub lr, lr, #3 vext.8 q2, q1, q2, #13 vext.8 q9, q8, q9, #13 2: vmovl.u8 q1, d4 vmovl.u8 q2, d5 vmovl.u8 q8, d18 vmovl.u8 q9, d19 tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub r9, r5, #14 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel vdup.8 d24, r11 vdup.8 d26, r9 vmovl.u8 q12, d24 vmovl.u8 q13, d26 3: // !LR_HAVE_RIGHT // If we'll have to pad the right edge we need to quit early here. cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid cmp r5, #7 bge 5f // If w >= 7, we can filter 4 pixels b 6f 4: // Loop horizontally .macro filter_8 // This is tuned as some sort of compromise between Cortex A7, A8, // A9 and A53. vmul.s16 q3, q1, d0[0] vext.8 q10, q1, q2, #2 vext.8 q11, q1, q2, #4 vmla.s16 q3, q10, d0[1] vmla.s16 q3, q11, d0[2] vext.8 q10, q1, q2, #6 vext.8 q11, q1, q2, #8 vmla.s16 q3, q10, d0[3] vmla.s16 q3, q11, d1[0] vext.8 q10, q1, q2, #10 vext.8 q11, q1, q2, #12 vmla.s16 q3, q10, d1[1] vmla.s16 q3, q11, d1[2] vmul.s16 q10, q8, d0[0] vext.8 q11, q8, q9, #2 vext.8 q4, q8, q9, #4 vmla.s16 q10, q11, d0[1] vmla.s16 q10, q4, d0[2] vext.8 q11, q8, q9, #6 vext.8 q4, q8, q9, #8 vmla.s16 q10, q11, d0[3] vmla.s16 q10, q4, d1[0] vext.8 q11, q8, q9, #10 vext.8 q4, q8, q9, #12 vmla.s16 q10, q11, d1[1] vmla.s16 q10, q4, d1[2] vext.8 q1, q1, q2, #6 vext.8 q8, q8, q9, #6 vshl.s16 q1, q1, #7 vshl.s16 q8, q8, #7 vsub.s16 q1, q1, q14 vsub.s16 q8, q8, q14 vqadd.s16 q3, q3, q1 vqadd.s16 q10, q10, q8 vshr.s16 q3, q3, #3 vshr.s16 q10, q10, #3 vadd.s16 q3, q3, q15 vadd.s16 q10, q10, q15 .endm filter_8 vst1.16 {q3}, [r0, :128]! vst1.16 {q10}, [r12, :128]! subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q1, q2 vmov q8, q9 vld1.8 {d4}, [r2]! vld1.8 {d18}, [lr]! vmovl.u8 q2, d4 vmovl.u8 q9, d18 bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 5: // Filter 4 pixels, 7 <= w < 11 .macro filter_4 vmul.s16 d6, d2, d0[0] vext.8 q10, q1, q2, #2 vext.8 q11, q1, q2, #4 vmla.s16 d6, d20, d0[1] vmla.s16 d6, d22, d0[2] vext.8 q10, q1, q2, #6 vext.8 q11, q1, q2, #8 vmla.s16 d6, d20, d0[3] vmla.s16 d6, d22, d1[0] vext.8 q10, q1, q2, #10 vext.8 q11, q1, q2, #12 vmla.s16 d6, d20, d1[1] vmla.s16 d6, d22, d1[2] vmul.s16 d20, d16, d0[0] vext.8 q11, q8, q9, #2 vext.8 q4, q8, q9, #4 vmla.s16 d20, d22, d0[1] vmla.s16 d20, d8, d0[2] vext.8 q11, q8, q9, #6 vext.8 q4, q8, q9, #8 vmla.s16 d20, d22, d0[3] vmla.s16 d20, d8, d1[0] vext.8 q11, q8, q9, #10 vext.8 q4, q8, q9, #12 vmla.s16 d20, d22, d1[1] vmla.s16 d20, d8, d1[2] vext.8 q11, q1, q2, #6 vshl.s16 d22, d22, #7 vsub.s16 d22, d22, d28 vqadd.s16 d6, d6, d22 vext.8 q11, q8, q9, #6 vshl.s16 d22, d22, #7 vsub.s16 d22, d22, d28 vqadd.s16 d20, d20, d22 vshr.s16 d6, d6, #3 vshr.s16 d20, d20, #3 vadd.s16 d6, d6, d30 vadd.s16 d20, d20, d30 .endm filter_4 vst1.16 {d6}, [r0, :64]! vst1.16 {d20}, [r12, :64]! subs r5, r5, #4 // 3 <= w < 7 vext.8 q1, q1, q2, #8 vext.8 q2, q2, q2, #8 vext.8 q8, q8, q9, #8 vext.8 q9, q9, q9, #8 6: // Pad the right edge and filter the last few pixels. // w < 7, w+3 pixels valid in q1-q2 cmp r5, #5 blt 7f bgt 8f // w == 5, 8 pixels valid in q1, q2 invalid vmov q2, q12 vmov q9, q13 b 88f 7: // 1 <= w < 5, 4-7 pixels valid in q1 sub r9, r5, #1 // w9 = (pixels valid - 4) adr r11, L(variable_shift_tbl) ldr r9, [r11, r9, lsl #2] add r11, r11, r9 vmov q2, q12 vmov q9, q13 bx r11 .align 2 L(variable_shift_tbl): .word 44f - L(variable_shift_tbl) + CONFIG_THUMB .word 55f - L(variable_shift_tbl) + CONFIG_THUMB .word 66f - L(variable_shift_tbl) + CONFIG_THUMB .word 77f - L(variable_shift_tbl) + CONFIG_THUMB // Shift q1 right, shifting out invalid pixels, // shift q1 left to the original offset, shifting in padding pixels. 44: // 4 pixels valid vext.8 q1, q1, q1, #8 vext.8 q1, q1, q2, #8 vext.8 q8, q8, q8, #8 vext.8 q8, q8, q9, #8 b 88f 55: // 5 pixels valid vext.8 q1, q1, q1, #10 vext.8 q1, q1, q2, #6 vext.8 q8, q8, q8, #10 vext.8 q8, q8, q9, #6 b 88f 66: // 6 pixels valid vext.8 q1, q1, q1, #12 vext.8 q1, q1, q2, #4 vext.8 q8, q8, q8, #12 vext.8 q8, q8, q9, #4 b 88f 77: // 7 pixels valid vext.8 q1, q1, q1, #14 vext.8 q1, q1, q2, #2 vext.8 q8, q8, q8, #14 vext.8 q8, q8, q9, #2 b 88f 8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2 vext.8 q2, q2, q2, #2 vext.8 q2, q2, q12, #14 vext.8 q9, q9, q9, #2 vext.8 q9, q9, q13, #14 88: // w < 7, q1-q2 padded properly cmp r5, #4 blt 888f // w >= 4, filter 4 pixels filter_4 vst1.16 {d6}, [r0, :64]! vst1.16 {d20}, [r12, :64]! subs r5, r5, #4 // 0 <= w < 4 vext.8 q1, q1, q2, #8 vext.8 q8, q8, q9, #8 beq 9f 888: // 1 <= w < 4, filter 1 pixel at a time vmul.s16 q3, q1, q0 vmul.s16 q10, q8, q0 vpadd.s16 d6, d6, d7 vpadd.s16 d7, d20, d21 vdup.16 d24, d2[3] vpadd.s16 d6, d6, d7 vdup.16 d25, d16[3] vpadd.s16 d6, d6, d6 vtrn.16 d24, d25 vshl.s16 d24, d24, #7 vsub.s16 d24, d24, d28 vqadd.s16 d6, d6, d24 vshr.s16 d6, d6, #3 vadd.s16 d6, d6, d30 vst1.s16 {d6[0]}, [r0, :16]! vst1.s16 {d6[1]}, [r12, :16]! subs r5, r5, #1 vext.8 q1, q1, q2, #2 vext.8 q8, q8, q9, #2 bgt 888b 9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r10 add r12, r12, r10 add r2, r2, r3 add lr, lr, r3 mov r5, r8 b 1b 0: vpop {q4} pop {r4-r11,pc} .purgem filter_8 .purgem filter_4 endfunc // void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, // const int16_t fv[7], enum LrEdgeFlags edges, // ptrdiff_t mid_stride); function wiener_filter_v_neon, export=1 push {r4-r7,lr} ldrd r4, r5, [sp, #20] ldrd r6, r7, [sp, #28] mov lr, r4 vmov.s16 q1, #0 mov r12, #128 vld1.16 {q0}, [r5] vmov.s16 d2[3], r12 vadd.s16 q0, q0, q1 // Calculate the number of rows to move back when looping vertically mov r12, r4 tst r6, #4 // LR_HAVE_TOP beq 0f sub r2, r2, r7, lsl #1 add r12, r12, #2 0: tst r6, #8 // LR_HAVE_BOTTOM beq 1f add r12, r12, #2 1: // Start of horizontal loop; start one vertical filter slice. // Load rows into q8-q11 and pad properly. tst r6, #4 // LR_HAVE_TOP vld1.16 {q8}, [r2, :128], r7 beq 2f // LR_HAVE_TOP vld1.16 {q10}, [r2, :128], r7 vmov q9, q8 vld1.16 {q11}, [r2, :128], r7 b 3f 2: // !LR_HAVE_TOP vmov q9, q8 vmov q10, q8 vmov q11, q8 3: cmp r4, #4 blt 5f // Start filtering normally; fill in q12-q14 with unique rows. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vld1.16 {q14}, [r2, :128], r7 4: .macro filter compare subs r4, r4, #1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d20, d0[2] vmlal.s16 q2, d22, d0[3] vmlal.s16 q2, d24, d1[0] vmlal.s16 q2, d26, d1[1] vmlal.s16 q2, d28, d1[2] vmull.s16 q3, d17, d0[0] vmlal.s16 q3, d19, d0[1] vmlal.s16 q3, d21, d0[2] vmlal.s16 q3, d23, d0[3] vmlal.s16 q3, d25, d1[0] vmlal.s16 q3, d27, d1[1] vmlal.s16 q3, d29, d1[2] vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 vst1.8 {d4}, [r0], r1 .if \compare cmp r4, #4 .else ble 9f .endif vmov q8, q9 vmov q9, q10 vmov q10, q11 vmov q11, q12 vmov q12, q13 vmov q13, q14 .endm filter 1 blt 7f vld1.16 {q14}, [r2, :128], r7 b 4b 5: // Less than 4 rows in total; not all of q12-q13 are filled yet. tst r6, #8 // LR_HAVE_BOTTOM beq 6f // LR_HAVE_BOTTOM cmp r4, #2 // We load at least 2 rows in all cases. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 bgt 53f // 3 rows in total beq 52f // 2 rows in total 51: // 1 row in total, q11 already loaded, load edge into q12-q14. vmov q13, q12 b 8f 52: // 2 rows in total, q11 already loaded, load q12 with content data // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vmov q15, q14 b 8f 53: // 3 rows in total, q11 already loaded, load q12 and q13 with content // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 6: // !LR_HAVE_BOTTOM cmp r4, #2 bgt 63f // 3 rows in total beq 62f // 2 rows in total 61: // 1 row in total, q11 already loaded, pad that into q12-q14. vmov q12, q11 vmov q13, q11 vmov q14, q11 b 8f 62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. vld1.16 {q12}, [r2, :128], r7 vmov q13, q12 vmov q14, q12 vmov q15, q12 b 8f 63: // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vmov q14, q13 vmov q15, q13 vmov q1, q13 b 8f 7: // All registers up to q13 are filled already, 3 valid rows left. // < 4 valid rows left; fill in padding and filter the last // few rows. tst r6, #8 // LR_HAVE_BOTTOM beq 71f // LR_HAVE_BOTTOM; load 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 71: // !LR_HAVE_BOTTOM, pad 3 rows vmov q14, q13 vmov q15, q13 vmov q1, q13 8: // At this point, all registers up to q14-15,q1 are loaded with // edge/padding (depending on how many rows are left). filter 0 // This branches to 9f when done vmov q14, q15 vmov q15, q1 b 8b 9: // End of one vertical slice. subs r3, r3, #8 ble 0f // Move pointers back up to the top and loop horizontally. mls r0, r1, lr, r0 mls r2, r7, r12, r2 add r0, r0, #8 add r2, r2, #16 mov r4, lr b 1b 0: pop {r4-r7,pc} .purgem filter endfunc // void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride, // const pixel *src, int w, int h); function copy_narrow_neon, export=1 push {r4,lr} ldr r4, [sp, #8] adr r12, L(copy_narrow_tbl) ldr r3, [r12, r3, lsl #2] add r12, r12, r3 bx r12 .align 2 L(copy_narrow_tbl): .word 0 .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB 10: add r3, r0, r1 lsl r1, r1, #1 18: subs r4, r4, #8 blt 110f vld1.8 {d0}, [r2, :64]! vst1.8 {d0[0]}, [r0], r1 vst1.8 {d0[1]}, [r3], r1 vst1.8 {d0[2]}, [r0], r1 vst1.8 {d0[3]}, [r3], r1 vst1.8 {d0[4]}, [r0], r1 vst1.8 {d0[5]}, [r3], r1 vst1.8 {d0[6]}, [r0], r1 vst1.8 {d0[7]}, [r3], r1 ble 0f b 18b 110: add r4, r4, #8 asr r1, r1, #1 11: subs r4, r4, #1 vld1.8 {d0[]}, [r2]! vst1.8 {d0[0]}, [r0], r1 bgt 11b 0: pop {r4,pc} 20: add r3, r0, r1 lsl r1, r1, #1 24: subs r4, r4, #4 blt 210f vld1.16 {d0}, [r2, :64]! vst1.16 {d0[0]}, [r0, :16], r1 vst1.16 {d0[1]}, [r3, :16], r1 vst1.16 {d0[2]}, [r0, :16], r1 vst1.16 {d0[3]}, [r3, :16], r1 ble 0f b 24b 210: add r4, r4, #4 asr r1, r1, #1 22: subs r4, r4, #1 vld1.16 {d0[]}, [r2]! vst1.16 {d0[0]}, [r0], r1 bgt 22b 0: pop {r4,pc} 30: ldrh r3, [r2] ldrb r12, [r2, #2] add r2, r2, #3 subs r4, r4, #1 strh r3, [r0] strb r12, [r0, #2] add r0, r0, r1 bgt 30b pop {r4,pc} 40: add r3, r0, r1 lsl r1, r1, #1 42: subs r4, r4, #2 blt 41f vld1.8 {d0}, [r2, :64]! vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[1]}, [r3, :32], r1 ble 0f b 42b 41: vld1.32 {d0[]}, [r2] vst1.32 {d0[0]}, [r0] 0: pop {r4,pc} 50: ldr r3, [r2] ldrb r12, [r2, #4] add r2, r2, #5 subs r4, r4, #1 str r3, [r0] strb r12, [r0, #4] add r0, r0, r1 bgt 50b pop {r4,pc} 60: ldr r3, [r2] ldrh r12, [r2, #4] add r2, r2, #6 subs r4, r4, #1 str r3, [r0] strh r12, [r0, #4] add r0, r0, r1 bgt 60b pop {r4,pc} 70: ldr r3, [r2] ldrh r12, [r2, #4] ldrb lr, [r2, #6] add r2, r2, #7 subs r4, r4, #1 str r3, [r0] strh r12, [r0, #4] strb lr, [r0, #6] add r0, r0, r1 bgt 70b pop {r4,pc} endfunc