ref: e98816e9c10c9e0567f57e06b9b23e515783502c
parent: 5c02c5ffb58a08c1b705e6fa2ab73f631f658823
author: Timothy B. Terriberry <[email protected]>
date: Tue Nov 26 16:55:29 EST 2013
Actually use my NEON code.

I'd commented out mine and stuck azanelli's code in for testing, and then accidentally committed it like that.
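For reference, the kernel this patch switches back to computes the 4-lag
cross-correlation described in its header comment: sum[k] = sum(x[j]*y[j+k],
j=0...len-1) for k = 0...3. A minimal C sketch of that arithmetic follows
(the typedefs stand in for CELT's fixed-point types and xcorr_kernel_c is a
hypothetical name; this illustrates the math, not the project's actual C
kernel):

    typedef short opus_val16;
    typedef int   opus_val32;

    /* Sketch: sum[k] += x[j]*y[j+k] for k = 0...3, j = 0...len-1.
     * len > 0 is assumed, matching the assertion noted in the assembly. */
    static void xcorr_kernel_c(const opus_val16 *x, const opus_val16 *y,
                               opus_val32 sum[4], int len)
    {
       int j;
       for (j = 0; j < len; j++)
       {
          int k;
          for (k = 0; k < 4; k++)
             sum[k] += (opus_val32)x[j]*(opus_val32)y[j+k];
       }
    }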
--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@@ -40,368 +40,215 @@
IF OPUS_ARM_MAY_HAVE_NEON
-;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-;xcorr_kernel_neon PROC
-; ; input:
-; ; r3 = int len
-; ; r4 = opus_val16 *x
-; ; r5 = opus_val16 *y
-; ; q0 = opus_val32 sum[4]
-; ; output:
-; ; q0 = opus_val32 sum[4]
-; ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
-; ; internal usage:
-; ; r12 = int j
-; ; d3 = y_3|y_2|y_1|y_0
-; ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
-; ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
-; ; q8 = scratch
-; ;
-; ; Load y[0...3]
-; ; This requires len>0 to always be valid (which we assert in the C code).
-; VLD1.16 {d5}, [r5]!
-; SUBS r12, r3, #8
-; BLE xcorr_kernel_neon_process4
-;; Process 8 samples at a time.
-;; This loop loads one y value more than we actually need. Therefore we have to
-;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-;; reading past the end of the array.
-;xcorr_kernel_neon_process8
-; ; This loop has 19 total instructions (10 cycles to issue, minimum), with
-; ; - 2 cycles of ARM insrtuctions,
-; ; - 10 cycles of load/store/byte permute instructions, and
-; ; - 9 cycles of data processing instructions.
-; ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
-; ; latter two categories, meaning the whole loop should run in 10 cycles per
-; ; iteration, barring cache misses.
-; ;
-; ; Load x[0...7]
-; VLD1.16 {d6, d7}, [r4]!
-; ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
-; ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
-; VAND d3, d5, d5
-; SUBS r12, r12, #8
-; ; Load y[4...11]
-; VLD1.16 {d4, d5}, [r5]!
-; VMLAL.S16 q0, d3, d6[0]
-; VEXT.16 d16, d3, d4, #1
-; VMLAL.S16 q0, d4, d7[0]
-; VEXT.16 d17, d4, d5, #1
-; VMLAL.S16 q0, d16, d6[1]
-; VEXT.16 d16, d3, d4, #2
-; VMLAL.S16 q0, d17, d7[1]
-; VEXT.16 d17, d4, d5, #2
-; VMLAL.S16 q0, d16, d6[2]
-; VEXT.16 d16, d3, d4, #3
-; VMLAL.S16 q0, d17, d7[2]
-; VEXT.16 d17, d4, d5, #3
-; VMLAL.S16 q0, d16, d6[3]
-; VMLAL.S16 q0, d17, d7[3]
-; BGT xcorr_kernel_neon_process8
-;; Process 4 samples here if we have > 4 left (still reading one extra y value).
-;xcorr_kernel_neon_process4
-; ADDS r12, r12, #4
-; BLE xcorr_kernel_neon_process2
-; ; Load x[0...3]
-; VLD1.16 d6, [r4]!
-; ; Use VAND since it's a data processing instruction again.
-; VAND d4, d5, d5
-; SUB r12, r12, #4
-; ; Load y[4...7]
-; VLD1.16 d5, [r5]!
-; VMLAL.S16 q0, d4, d6[0]
-; VEXT.16 d16, d4, d5, #1
-; VMLAL.S16 q0, d16, d6[1]
-; VEXT.16 d16, d4, d5, #2
-; VMLAL.S16 q0, d16, d6[2]
-; VEXT.16 d16, d4, d5, #3
-; VMLAL.S16 q0, d16, d6[3]
-;; Process 2 samples here if we have > 2 left (still reading one extra y value).
-;xcorr_kernel_neon_process2
-; ADDS r12, r12, #2
-; BLE xcorr_kernel_neon_process1
-; ; Load x[0...1]
-; VLD2.16 {d6[],d7[]}, [r4]!
-; ; Use VAND since it's a data processing instruction again.
-; VAND d4, d5, d5
-; SUB r12, r12, #2
-; ; Load y[4...5]
-; VLD1.32 {d5[]}, [r5]!
-; VMLAL.S16 q0, d4, d6
-; VEXT.16 d16, d4, d5, #1
-; ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
-; ; instead of VEXT, since it's a data-processing instruction.
-; VSRI.64 d5, d4, #32
-; VMLAL.S16 q0, d16, d7
-;; Process 1 sample using the extra y value we loaded above.
-;xcorr_kernel_neon_process1
-; ; Load next *x
-; VLD1.16 {d6[]}, [r4]!
-; ADDS r12, r12, #1
-; ; y[0...3] are left in d5 from prior iteration(s) (if any)
-; VMLAL.S16 q0, d5, d6
-; MOVLE pc, lr
-;; Now process 1 last sample, not reading ahead.
-; ; Load last *y
-; VLD1.16 {d4[]}, [r5]!
-; VSRI.64 d4, d5, #16
-; ; Load last *x
-; VLD1.16 {d6[]}, [r4]!
-; VMLAL.S16 q0, d4, d6
-; MOV pc, lr
-; ENDP
-
-;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-;; opus_val32 *xcorr, int len, int max_pitch)
-;celt_pitch_xcorr_neon PROC
-; ; input:
-; ; r0 = opus_val16 *_x
-; ; r1 = opus_val16 *_y
-; ; r2 = opus_val32 *xcorr
-; ; r3 = int len
-; ; output:
-; ; r0 = int maxcorr
-; ; internal usage:
-; ; r4 = opus_val16 *x (for xcorr_kernel_neon())
-; ; r5 = opus_val16 *y (for xcorr_kernel_neon())
-; ; r6 = int max_pitch
-; ; r12 = int j
-; ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
-; STMFD sp!, {r4-r6, lr}
-; LDR r6, [sp, #16]
-; VMOV.S32 q15, #1
-; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-; SUBS r6, r6, #4
-; BLT celt_pitch_xcorr_neon_process4_done
-;celt_pitch_xcorr_neon_process4
-; ; xcorr_kernel_neon parameters:
-; ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
-; MOV r4, r0
-; MOV r5, r1
-; VEOR q0, q0, q0
-; ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
-; ; So we don't save/restore any other registers.
-; BL xcorr_kernel_neon
-; SUBS r6, r6, #4
-; VST1.32 {q0}, [r2]!
-; ; _y += 4
-; ADD r1, r1, #8
-; VMAX.S32 q15, q15, q0
-; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-; BGE celt_pitch_xcorr_neon_process4
-;; We have less than 4 sums left to compute.
-;celt_pitch_xcorr_neon_process4_done
-; ADDS r6, r6, #4
-; ; Reduce maxcorr to a single value
-; VMAX.S32 d30, d30, d31
-; VPMAX.S32 d30, d30, d30
-; ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
-; BLE celt_pitch_xcorr_neon_done
-;; Now compute each remaining sum one at a time.
-;celt_pitch_xcorr_neon_process_remaining
-; MOV r4, r0
-; MOV r5, r1
-; VMOV.I32 q0, #0
-; SUBS r12, r3, #8
-; BLT celt_pitch_xcorr_neon_process_remaining4
-;; Sum terms 8 at a time.
-;celt_pitch_xcorr_neon_process_remaining_loop8
-; ; Load x[0...7]
-; VLD1.16 {q1}, [r4]!
-; ; Load y[0...7]
-; VLD1.16 {q2}, [r5]!
-; SUBS r12, r12, #8
-; VMLAL.S16 q0, d4, d2
-; VMLAL.S16 q0, d5, d3
-; BGE celt_pitch_xcorr_neon_process_remaining_loop8
-;; Sum terms 4 at a time.
-;celt_pitch_xcorr_neon_process_remaining4
-; ADDS r12, r12, #4
-; BLT celt_pitch_xcorr_neon_process_remaining4_done
-; ; Load x[0...3]
-; VLD1.16 {d2}, [r4]!
-; ; Load y[0...3]
-; VLD1.16 {d3}, [r5]!
-; SUB r12, r12, #4
-; VMLAL.S16 q0, d3, d2
-; ; Reduce the sum to a single value.
-; VADD.S32 d0, d0, d1
-; VPADDL.S32 d0, d0
-;celt_pitch_xcorr_neon_process_remaining4_done
-; ADDS r12, r12, #4
-; BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-;; Sum terms 1 at a time.
-;celt_pitch_xcorr_neon_process_remaining_loop1
-; VLD1.16 {d2[]}, [r4]!
-; VLD1.16 {d3[]}, [r5]!
-; SUBS r12, r12, #1
-; VMLAL.S16 q0, d2, d3
-; BGT celt_pitch_xcorr_neon_process_remaining_loop1
-;celt_pitch_xcorr_neon_process_remaining_loop_done
-; VST1.32 {d0[0]}, [r2]!
-; VMAX.S32 d30, d30, d0
-; SUBS r6, r6, #1
-; ; _y++
-; ADD r1, r1, #2
-; ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
-; BGT celt_pitch_xcorr_neon_process_remaining
-;celt_pitch_xcorr_neon_done
-; VMOV.32 r0, d30[0]
-; LDMFD sp!, {r4-r6, pc}
-; ENDP
-
+; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
; input:
- ; r0 = opus_val16 *x
- ; r1 = opus_val16 *y
- ; r2 = int len
- ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0])
-
+ ; r3 = int len
+ ; r4 = opus_val16 *x
+ ; r5 = opus_val16 *y
+ ; q0 = opus_val32 sum[4]
; output:
- ; q0 = sum
-
+ ; q0 = opus_val32 sum[4]
+ ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; internal usage:
- ; r3 = j
- ; d2 = x_3|x_2|x_1|x_0 d3 = y_3|y_2|y_1|y_0
- ; d4 = y_7|y_6|y_5|y_4 d5 = y_4|y_3|y_2|y_1
- ; d6 = y_5|y_4|y_3|y_2 d7 = y_6|y_5|y_4|y_3
- ; We will build d5, d6 and d7 vector from d3 and d4
-
-
- VLD1.16 {d3}, [r1]! ; Load y[3] downto y[0] to d3 lane (yy0)
- SUB r3, r2, #1
- MOVS r3, r3, lsr #2 ; j=(len-1)>>2
- BEQ xcorr_kernel_neon_process4_done
-
- ; Process 4 x samples at a time
- ; For this, we will need 4 y vectors
+ ; r12 = int j
+ ; d3 = y_3|y_2|y_1|y_0
+ ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+ ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+ ; q8 = scratch
+ ;
+ ; Load y[0...3]
+ ; This requires len>0 to always be valid (which we assert in the C code).
+ VLD1.16 {d5}, [r5]!
+ SUBS r12, r3, #8
+ BLE xcorr_kernel_neon_process4
+; Process 8 samples at a time.
+; This loop loads one y value more than we actually need. Therefore we have to
+; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+; reading past the end of the array.
+xcorr_kernel_neon_process8
+ ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+ ; - 2 cycles of ARM instructions,
+ ; - 10 cycles of load/store/byte permute instructions, and
+ ; - 9 cycles of data processing instructions.
+ ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+ ; latter two categories, meaning the whole loop should run in 10 cycles per
+ ; iteration, barring cache misses.
+ ;
+ ; Load x[0...7]
+ VLD1.16 {d6, d7}, [r4]!
+ ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
+ ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+ VAND d3, d5, d5
+ SUBS r12, r12, #8
+ ; Load y[4...11]
+ VLD1.16 {d4, d5}, [r5]!
+ VMLAL.S16 q0, d3, d6[0]
+ VEXT.16 d16, d3, d4, #1
+ VMLAL.S16 q0, d4, d7[0]
+ VEXT.16 d17, d4, d5, #1
+ VMLAL.S16 q0, d16, d6[1]
+ VEXT.16 d16, d3, d4, #2
+ VMLAL.S16 q0, d17, d7[1]
+ VEXT.16 d17, d4, d5, #2
+ VMLAL.S16 q0, d16, d6[2]
+ VEXT.16 d16, d3, d4, #3
+ VMLAL.S16 q0, d17, d7[2]
+ VEXT.16 d17, d4, d5, #3
+ VMLAL.S16 q0, d16, d6[3]
+ VMLAL.S16 q0, d17, d7[3]
+ BGT xcorr_kernel_neon_process8
+; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
- SUBS r3, r3, #1 ; j--
- VLD1.16 d4, [r1]! ; Load y[7] downto y[4] to d4 lane
- VLD1.16 d2, [r0]! ; Load x[3] downto x[0] to d2 lane
- VEXT.16 d5, d3, d4, #1 ; Build y[4] downto y[1] vector (yy1)
- VEXT.16 d6, d3, d4, #2 ; Build y[5] downto y[2] vector (yy2)
- VEXT.16 d7, d3, d4, #3 ; Build y[6] downto y[3] vector (yy3)
-
- VMLAL.S16 q0, d3, d2[0] ; MAC16_16(sum, x[0], yy0)
- VMLAL.S16 q0, d5, d2[1] ; MAC16_16(sum, x[1], yy1)
- VMLAL.S16 q0, d6, d2[2] ; MAC16_16(sum, x[2], yy2)
- VMLAL.S16 q0, d7, d2[3] ; MAC16_16(sum, x[3], yy3)
-
- VMOV.S16 d3, d4 ; Next y vector should be in d3 (yy0)
-
- BNE xcorr_kernel_neon_process4
-
-xcorr_kernel_neon_process4_done
- ;Process len-1 to len
- VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane
-
- SUB r3, r2, #1
- ANDS r3, r3, #3 ; j=(len-1)&3
- VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0)
- BEQ xcorr_kernel_neon_done
-
-xcorr_kernel_neon_process_remaining
- SUBS r3, r3, #1 ; j--
- VLD1.16 {d4[]}, [r1]! ; Load y value and duplicate to d4 lane
- VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane
- VEXT.16 d3, d3, d4, #1 ; Build y vector from previous and d4
- VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0)
- BNE xcorr_kernel_neon_process_remaining
-
-xcorr_kernel_neon_done
- MOV pc, lr
+ ADDS r12, r12, #4
+ BLE xcorr_kernel_neon_process2
+ ; Load x[0...3]
+ VLD1.16 d6, [r4]!
+ ; Use VAND since it's a data processing instruction again.
+ VAND d4, d5, d5
+ SUB r12, r12, #4
+ ; Load y[4...7]
+ VLD1.16 d5, [r5]!
+ VMLAL.S16 q0, d4, d6[0]
+ VEXT.16 d16, d4, d5, #1
+ VMLAL.S16 q0, d16, d6[1]
+ VEXT.16 d16, d4, d5, #2
+ VMLAL.S16 q0, d16, d6[2]
+ VEXT.16 d16, d4, d5, #3
+ VMLAL.S16 q0, d16, d6[3]
+; Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2
+ ADDS r12, r12, #2
+ BLE xcorr_kernel_neon_process1
+ ; Load x[0...1]
+ VLD2.16 {d6[],d7[]}, [r4]!
+ ; Use VAND since it's a data processing instruction again.
+ VAND d4, d5, d5
+ SUB r12, r12, #2
+ ; Load y[4...5]
+ VLD1.32 {d5[]}, [r5]!
+ VMLAL.S16 q0, d4, d6
+ VEXT.16 d16, d4, d5, #1
+ ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+ ; instead of VEXT, since it's a data-processing instruction.
+ VSRI.64 d5, d4, #32
+ VMLAL.S16 q0, d16, d7
+; Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1
+ ; Load next *x
+ VLD1.16 {d6[]}, [r4]!
+ ADDS r12, r12, #1
+ ; y[0...3] are left in d5 from prior iteration(s) (if any)
+ VMLAL.S16 q0, d5, d6
+ MOVLE pc, lr
+; Now process 1 last sample, not reading ahead.
+ ; Load last *y
+ VLD1.16 {d4[]}, [r5]!
+ VSRI.64 d4, d5, #16
+ ; Load last *x
+ VLD1.16 {d6[]}, [r4]!
+ VMLAL.S16 q0, d4, d6
+ MOV pc, lr
ENDP
+; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+; opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_neon PROC
; input:
- ; r0 = opus_val16 *_x
- ; r1 = opus_val16 *_y
- ; r2 = opus_val32 *xcorr
- ; r3 = int len
-
+ ; r0 = opus_val16 *_x
+ ; r1 = opus_val16 *_y
+ ; r2 = opus_val32 *xcorr
+ ; r3 = int len
; output:
- ; r0 = maxcorr
-
- STMFD sp!, {r4-r9, lr}
-
- LDR r4, [sp, #28] ; r4 = int max_pitch
- MOV r5, r0 ; r5 = _x
- MOV r6, r1 ; r6 = _y
- MOV r7, r2 ; r7 = xcorr
- MOV r2, r3 ; r2 = len
-
- VMOV.S32 d16, #1 ; d16 = {1, 1} (not used by xcorr_kernel_neon)
- MOV r8, #0 ; r8 = i = 0
- CMP r4, #3 ; max_pitch-3 <= 0 ---> pitch_xcorr_neon_process4_done
- BLE celt_pitch_xcorr_neon_process4_done
-
- SUB r9, r4, #3 ; r9 = max_pitch-3
-
+ ; r0 = int maxcorr
+ ; internal usage:
+ ; r4 = opus_val16 *x (for xcorr_kernel_neon())
+ ; r5 = opus_val16 *y (for xcorr_kernel_neon())
+ ; r6 = int max_pitch
+ ; r12 = int j
+ ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+ STMFD sp!, {r4-r6, lr}
+ LDR r6, [sp, #16]
+ VMOV.S32 q15, #1
+ ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+ SUBS r6, r6, #4
+ BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
- MOV r0, r5 ; r0 = _x
- ADD r1, r6 ,r8, LSL #1 ; r1 = _y + i
- VMOV.I32 q0, #0 ; q0 = opus_val32 sum[4] = {0, 0, 0, 0}
-
- ; xcorr_kernel_neon don't touch r2 (len)
- ; So we don't store it
- BL xcorr_kernel_neon ; xcorr_kernel_neon(_x, _y+i, sum, len)
-
- VST1.32 {q0}, [r7]! ; Store sum to xcorr
- VPMAX.S32 d0, d0, d1 ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0])
- ADD r8, r8, #4 ; i+=4
- VPMAX.S32 d0, d0, d0 ; d0 = max(sum[3], sum[2], sum[1], sum[0])
- CMP r8, r9 ; i < max_pitch-3 ----> pitch_xcorr_neon_process4
- VMAX.S32 d16, d16, d0 ; d16 = maxcorr = max(maxcorr, sum)
-
- BLT celt_pitch_xcorr_neon_process4
-
+ ; xcorr_kernel_neon parameters:
+ ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+ MOV r4, r0
+ MOV r5, r1
+ VEOR q0, q0, q0
+ ; xcorr_kernel_neon only modifies r4, r5, r12, q0...q3, and q8.
+ ; So we don't save/restore any other registers.
+ BL xcorr_kernel_neon
+ SUBS r6, r6, #4
+ VST1.32 {q0}, [r2]!
+ ; _y += 4
+ ADD r1, r1, #8
+ VMAX.S32 q15, q15, q0
+ ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+ BGE celt_pitch_xcorr_neon_process4
+; We have fewer than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
- CMP r8, r4;
- BGE celt_pitch_xcorr_neon_done
-
+ ADDS r6, r6, #4
+ ; Reduce maxcorr to a single value
+ VMAX.S32 d30, d30, d31
+ VPMAX.S32 d30, d30, d30
+ ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+ BLE celt_pitch_xcorr_neon_done
+; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
- MOV r0, r5 ; r0 = _x
- ADD r1, r6, r8, LSL #1 ; r1 = _y + i
- VMOV.I32 q0, #0
- MOVS r3, r2, LSR #2 ; r3 = j = len
- BEQ inner_loop_neon_process4_done
-
-inner_loop_neon_process4
- VLD1.16 {d2}, [r0]! ; Load x
- VLD1.16 {d3}, [r1]! ; Load y
- SUBS r3, r3, #1
- VMLAL.S16 q0, d2, d3
- BNE inner_loop_neon_process4
-
- VPADD.S32 d0, d0, d1 ; Reduce sum
- VPADD.S32 d0, d0, d0
-
-inner_loop_neon_process4_done
- ANDS r3, r2, #3
- BEQ inner_loop_neon_done
-
-inner_loop_neon_process_remaining
- VLD1.16 {d2[]}, [r0]!
- VLD1.16 {d3[]}, [r1]!
- SUBS r3, r3, #1
- VMLAL.S16 q0, d2, d3
- BNE inner_loop_neon_process_remaining
-
-inner_loop_neon_done
- VST1.32 {d0[0]}, [r7]!
- VMAX.S32 d16, d16, d0
-
- ADD r8, r8, #1
- CMP r8, r4
- BCC celt_pitch_xcorr_neon_process_remaining
-
+ MOV r4, r0
+ MOV r5, r1
+ VMOV.I32 q0, #0
+ SUBS r12, r3, #8
+ BLT celt_pitch_xcorr_neon_process_remaining4
+; Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8
+ ; Load x[0...7]
+ VLD1.16 {q1}, [r4]!
+ ; Load y[0...7]
+ VLD1.16 {q2}, [r5]!
+ SUBS r12, r12, #8
+ VMLAL.S16 q0, d4, d2
+ VMLAL.S16 q0, d5, d3
+ BGE celt_pitch_xcorr_neon_process_remaining_loop8
+; Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4
+ ADDS r12, r12, #4
+ BLT celt_pitch_xcorr_neon_process_remaining4_done
+ ; Load x[0...3]
+ VLD1.16 {d2}, [r4]!
+ ; Load y[0...3]
+ VLD1.16 {d3}, [r5]!
+ SUB r12, r12, #4
+ VMLAL.S16 q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done
+ ; Reduce the sum to a single value.
+ VADD.S32 d0, d0, d1
+ VPADDL.S32 d0, d0
+ ADDS r12, r12, #4
+ BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+; Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1
+ VLD1.16 {d2[]}, [r4]!
+ VLD1.16 {d3[]}, [r5]!
+ SUBS r12, r12, #1
+ VMLAL.S16 q0, d2, d3
+ BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done
+ VST1.32 {d0[0]}, [r2]!
+ VMAX.S32 d30, d30, d0
+ SUBS r6, r6, #1
+ ; _y++
+ ADD r1, r1, #2
+ ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+ BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
- VMOV d0, d16
- VMOV.32 r0, d0[0]
- LDMFD sp!, {r4-r9, pc}
+ VMOV.32 r0, d30[0]
+ LDMFD sp!, {r4-r6, pc}
ENDP
-
ENDIF
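For completeness, a hedged C outline of the driver's control flow:
celt_pitch_xcorr_neon fills xcorr[] four lags per xcorr_kernel_neon call,
handles any remaining lags one at a time, and tracks the running maximum,
seeded to 1 (matching the VMOV.S32 q15, #1 above). Again a sketch of the
logic (pitch_xcorr_sketch is a hypothetical name; it reuses xcorr_kernel_c
from the sketch before the diff), not the shipped scalar code:

    static opus_val32 pitch_xcorr_sketch(const opus_val16 *_x,
                                         const opus_val16 *_y,
                                         opus_val32 *xcorr,
                                         int len, int max_pitch)
    {
       int i;
       opus_val32 maxcorr = 1;
       /* Four lags at a time (celt_pitch_xcorr_neon_process4). */
       for (i = 0; i <= max_pitch-4; i += 4)
       {
          opus_val32 sum[4] = {0, 0, 0, 0};
          int k;
          xcorr_kernel_c(_x, _y+i, sum, len);
          for (k = 0; k < 4; k++)
          {
             xcorr[i+k] = sum[k];
             if (sum[k] > maxcorr)
                maxcorr = sum[k];
          }
       }
       /* Remaining lags one at a time
        * (celt_pitch_xcorr_neon_process_remaining). */
       for (; i < max_pitch; i++)
       {
          opus_val32 sum = 0;
          int j;
          for (j = 0; j < len; j++)
             sum += (opus_val32)_x[j]*(opus_val32)_y[i+j];
          xcorr[i] = sum;
          if (sum > maxcorr)
             maxcorr = sum;
       }
       return maxcorr;
    }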