shithub: opus

Download patch

ref: e98816e9c10c9e0567f57e06b9b23e515783502c
parent: 5c02c5ffb58a08c1b705e6fa2ab73f631f658823
author: Timothy B. Terriberry <[email protected]>
date: Tue Nov 26 16:55:29 EST 2013

Actually use my NEON code.

I'd commented out mine and stuck azanelli's code in for testing,
 and then accidentally committed it like that.

--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@@ -40,368 +40,215 @@
 
 IF OPUS_ARM_MAY_HAVE_NEON
 
-;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-;xcorr_kernel_neon PROC
-;  ; input:
-;  ;   r3     = int         len
-;  ;   r4     = opus_val16 *x
-;  ;   r5     = opus_val16 *y
-;  ;   q0     = opus_val32  sum[4]
-;  ; output:
-;  ;   q0     = opus_val32  sum[4]
-;  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
-;  ; internal usage:
-;  ;   r12 = int j
-;  ;   d3  = y_3|y_2|y_1|y_0
-;  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
-;  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
-;  ;   q8  = scratch
-;  ;
-;  ; Load y[0...3]
-;  ; This requires len>0 to always be valid (which we assert in the C code).
-;  VLD1.16      {d5}, [r5]!
-;  SUBS         r12, r3, #8
-;  BLE xcorr_kernel_neon_process4
-;; Process 8 samples at a time.
-;; This loop loads one y value more than we actually need. Therefore we have to
-;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-;; reading past the end of the array.
-;xcorr_kernel_neon_process8
-;  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
-;  ; - 2 cycles of ARM insrtuctions,
-;  ; - 10 cycles of load/store/byte permute instructions, and
-;  ; - 9 cycles of data processing instructions.
-;  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
-;  ; latter two categories, meaning the whole loop should run in 10 cycles per
-;  ; iteration, barring cache misses.
-;  ;
-;  ; Load x[0...7]
-;  VLD1.16      {d6, d7}, [r4]!
-;  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
-;  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
-;  VAND         d3, d5, d5
-;  SUBS         r12, r12, #8
-;  ; Load y[4...11]
-;  VLD1.16      {d4, d5}, [r5]!
-;  VMLAL.S16    q0, d3, d6[0]
-;  VEXT.16      d16, d3, d4, #1
-;  VMLAL.S16    q0, d4, d7[0]
-;  VEXT.16      d17, d4, d5, #1
-;  VMLAL.S16    q0, d16, d6[1]
-;  VEXT.16      d16, d3, d4, #2
-;  VMLAL.S16    q0, d17, d7[1]
-;  VEXT.16      d17, d4, d5, #2
-;  VMLAL.S16    q0, d16, d6[2]
-;  VEXT.16      d16, d3, d4, #3
-;  VMLAL.S16    q0, d17, d7[2]
-;  VEXT.16      d17, d4, d5, #3
-;  VMLAL.S16    q0, d16, d6[3]
-;  VMLAL.S16    q0, d17, d7[3]
-;  BGT xcorr_kernel_neon_process8
-;; Process 4 samples here if we have > 4 left (still reading one extra y value).
-;xcorr_kernel_neon_process4
-;  ADDS         r12, r12, #4
-;  BLE xcorr_kernel_neon_process2
-;  ; Load x[0...3]
-;  VLD1.16      d6, [r4]!
-;  ; Use VAND since it's a data processing instruction again.
-;  VAND         d4, d5, d5
-;  SUB          r12, r12, #4
-;  ; Load y[4...7]
-;  VLD1.16      d5, [r5]!
-;  VMLAL.S16    q0, d4, d6[0]
-;  VEXT.16      d16, d4, d5, #1
-;  VMLAL.S16    q0, d16, d6[1]
-;  VEXT.16      d16, d4, d5, #2
-;  VMLAL.S16    q0, d16, d6[2]
-;  VEXT.16      d16, d4, d5, #3
-;  VMLAL.S16    q0, d16, d6[3]
-;; Process 2 samples here if we have > 2 left (still reading one extra y value).
-;xcorr_kernel_neon_process2
-;  ADDS         r12, r12, #2
-;  BLE xcorr_kernel_neon_process1
-;  ; Load x[0...1]
-;  VLD2.16      {d6[],d7[]}, [r4]!
-;  ; Use VAND since it's a data processing instruction again.
-;  VAND         d4, d5, d5
-;  SUB          r12, r12, #2
-;  ; Load y[4...5]
-;  VLD1.32      {d5[]}, [r5]!
-;  VMLAL.S16    q0, d4, d6
-;  VEXT.16      d16, d4, d5, #1
-;  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
-;  ; instead of VEXT, since it's a data-processing instruction.
-;  VSRI.64      d5, d4, #32
-;  VMLAL.S16    q0, d16, d7
-;; Process 1 sample using the extra y value we loaded above.
-;xcorr_kernel_neon_process1
-;  ; Load next *x
-;  VLD1.16      {d6[]}, [r4]!
-;  ADDS         r12, r12, #1
-;  ; y[0...3] are left in d5 from prior iteration(s) (if any)
-;  VMLAL.S16    q0, d5, d6
-;  MOVLE        pc, lr
-;; Now process 1 last sample, not reading ahead.
-;  ; Load last *y
-;  VLD1.16      {d4[]}, [r5]!
-;  VSRI.64      d4, d5, #16
-;  ; Load last *x
-;  VLD1.16      {d6[]}, [r4]!
-;  VMLAL.S16    q0, d4, d6
-;  MOV          pc, lr
-;  ENDP
-
-;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-;;  opus_val32 *xcorr, int len, int max_pitch)
-;celt_pitch_xcorr_neon PROC
-;  ; input:
-;  ;   r0  = opus_val16 *_x
-;  ;   r1  = opus_val16 *_y
-;  ;   r2  = opus_val32 *xcorr
-;  ;   r3  = int         len
-;  ; output:
-;  ;   r0  = int         maxcorr
-;  ; internal usage:
-;  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
-;  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
-;  ;   r6  = int         max_pitch
-;  ;   r12 = int         j
-;  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
-;  STMFD        sp!, {r4-r6, lr}
-;  LDR          r6, [sp, #16]
-;  VMOV.S32     q15, #1
-;  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-;  SUBS         r6, r6, #4
-;  BLT celt_pitch_xcorr_neon_process4_done
-;celt_pitch_xcorr_neon_process4
-;  ; xcorr_kernel_neon parameters:
-;  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
-;  MOV          r4, r0
-;  MOV          r5, r1
-;  VEOR         q0, q0, q0
-;  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
-;  ; So we don't save/restore any other registers.
-;  BL xcorr_kernel_neon
-;  SUBS         r6, r6, #4
-;  VST1.32      {q0}, [r2]!
-;  ; _y += 4
-;  ADD          r1, r1, #8
-;  VMAX.S32     q15, q15, q0
-;  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-;  BGE celt_pitch_xcorr_neon_process4
-;; We have less than 4 sums left to compute.
-;celt_pitch_xcorr_neon_process4_done
-;  ADDS         r6, r6, #4
-;  ; Reduce maxcorr to a single value
-;  VMAX.S32     d30, d30, d31
-;  VPMAX.S32    d30, d30, d30
-;  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
-;  BLE celt_pitch_xcorr_neon_done
-;; Now compute each remaining sum one at a time.
-;celt_pitch_xcorr_neon_process_remaining
-;  MOV          r4, r0
-;  MOV          r5, r1
-;  VMOV.I32     q0, #0
-;  SUBS         r12, r3, #8
-;  BLT celt_pitch_xcorr_neon_process_remaining4
-;; Sum terms 8 at a time.
-;celt_pitch_xcorr_neon_process_remaining_loop8
-;  ; Load x[0...7]
-;  VLD1.16      {q1}, [r4]!
-;  ; Load y[0...7]
-;  VLD1.16      {q2}, [r5]!
-;  SUBS         r12, r12, #8
-;  VMLAL.S16    q0, d4, d2
-;  VMLAL.S16    q0, d5, d3
-;  BGE celt_pitch_xcorr_neon_process_remaining_loop8
-;; Sum terms 4 at a time.
-;celt_pitch_xcorr_neon_process_remaining4
-;  ADDS         r12, r12, #4
-;  BLT celt_pitch_xcorr_neon_process_remaining4_done
-;  ; Load x[0...3]
-;  VLD1.16      {d2}, [r4]!
-;  ; Load y[0...3]
-;  VLD1.16      {d3}, [r5]!
-;  SUB          r12, r12, #4
-;  VMLAL.S16    q0, d3, d2
-;  ; Reduce the sum to a single value.
-;  VADD.S32     d0, d0, d1
-;  VPADDL.S32   d0, d0
-;celt_pitch_xcorr_neon_process_remaining4_done
-;  ADDS         r12, r12, #4
-;  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-;; Sum terms 1 at a time.
-;celt_pitch_xcorr_neon_process_remaining_loop1
-;  VLD1.16      {d2[]}, [r4]!
-;  VLD1.16      {d3[]}, [r5]!
-;  SUBS         r12, r12, #1
-;  VMLAL.S16    q0, d2, d3
-;  BGT celt_pitch_xcorr_neon_process_remaining_loop1
-;celt_pitch_xcorr_neon_process_remaining_loop_done
-;  VST1.32      {d0[0]}, [r2]!
-;  VMAX.S32     d30, d30, d0
-;  SUBS         r6, r6, #1
-;  ; _y++
-;  ADD          r1, r1, #2
-;  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
-;  BGT celt_pitch_xcorr_neon_process_remaining
-;celt_pitch_xcorr_neon_done
-;  VMOV.32      r0, d30[0]
-;  LDMFD        sp!, {r4-r6, pc}
-;  ENDP
-
+; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
 xcorr_kernel_neon PROC
   ; input:
-  ; r0 = opus_val16 *x
-  ; r1 = opus_val16 *y
-  ; r2 = int        len
-  ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0])
-
+  ;   r3     = int         len
+  ;   r4     = opus_val16 *x
+  ;   r5     = opus_val16 *y
+  ;   q0     = opus_val32  sum[4]
   ; output:
-  ; q0 = sum
-
+  ;   q0     = opus_val32  sum[4]
+  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
   ; internal usage:
-  ; r3 = j
-  ; d2 = x_3|x_2|x_1|x_0  d3 = y_3|y_2|y_1|y_0
-  ; d4 = y_7|y_6|y_5|y_4  d5 = y_4|y_3|y_2|y_1
-  ; d6 = y_5|y_4|y_3|y_2  d7 = y_6|y_5|y_4|y_3
-  ; We will build d5, d6 and d7 vector from d3 and d4
-
-
-  VLD1.16   {d3}, [r1]!      ; Load y[3] downto y[0] to d3 lane (yy0)
-  SUB       r3, r2, #1
-  MOVS      r3, r3, lsr #2   ; j=(len-1)>>2
-  BEQ       xcorr_kernel_neon_process4_done
-
-  ; Process 4 x samples at a time
-  ; For this, we will need 4 y vectors
+  ;   r12 = int j
+  ;   d3  = y_3|y_2|y_1|y_0
+  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  ;   q8  = scratch
+  ;
+  ; Load y[0...3]
+  ; This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8
+  BLE xcorr_kernel_neon_process4
+; Process 8 samples at a time.
+; This loop loads one y value more than we actually need. Therefore we have to
+; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+; reading past the end of the array.
+xcorr_kernel_neon_process8
+  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+  ; - 2 cycles of ARM instructions,
+  ; - 10 cycles of load/store/byte permute instructions, and
+  ; - 9 cycles of data processing instructions.
+  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  ; latter two categories, meaning the whole loop should run in 10 cycles per
+  ; iteration, barring cache misses.
+  ;
+  ; Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
+  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8
+  ; Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8
+; Process 4 samples here if we have > 4 left (still reading one extra y value).
 xcorr_kernel_neon_process4
-  SUBS      r3, r3, #1       ; j--
-  VLD1.16   d4, [r1]!        ; Load y[7] downto y[4] to d4 lane
-  VLD1.16   d2, [r0]!        ; Load x[3] downto x[0] to d2 lane
-  VEXT.16   d5, d3, d4, #1   ; Build y[4] downto y[1] vector (yy1)
-  VEXT.16   d6, d3, d4, #2   ; Build y[5] downto y[2] vector (yy2)
-  VEXT.16   d7, d3, d4, #3   ; Build y[6] downto y[3] vector (yy3)
-
-  VMLAL.S16 q0, d3, d2[0]    ; MAC16_16(sum, x[0], yy0)
-  VMLAL.S16 q0, d5, d2[1]    ; MAC16_16(sum, x[1], yy1)
-  VMLAL.S16 q0, d6, d2[2]    ; MAC16_16(sum, x[2], yy2)
-  VMLAL.S16 q0, d7, d2[3]    ; MAC16_16(sum, x[3], yy3)
-
-  VMOV.S16  d3, d4           ; Next y vector should be in d3 (yy0)
-
-  BNE xcorr_kernel_neon_process4
-
-xcorr_kernel_neon_process4_done
-  ;Process len-1 to len
-  VLD1.16   {d2[]}, [r0]!    ; Load *x and duplicate to d2 lane
-
-  SUB       r3, r2, #1
-  ANDS      r3, r3, #3       ; j=(len-1)&3
-  VMLAL.S16 q0, d3, d2       ; MAC16_16(sum, *x, yy0)
-  BEQ xcorr_kernel_neon_done
-
-xcorr_kernel_neon_process_remaining
-  SUBS      r3, r3, #1       ; j--
-  VLD1.16   {d4[]}, [r1]!    ; Load y value and duplicate to d4 lane
-  VLD1.16   {d2[]}, [r0]!    ; Load *x and duplicate to d2 lane
-  VEXT.16   d3, d3, d4, #1   ; Build y vector from previous and d4
-  VMLAL.S16 q0, d3, d2       ; MAC16_16(sum, *x, yy0)
-  BNE xcorr_kernel_neon_process_remaining
-
-xcorr_kernel_neon_done
-  MOV       pc, lr
+  ADDS         r12, r12, #4
+  BLE xcorr_kernel_neon_process2
+  ; Load x[0...3]
+  VLD1.16      d6, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4
+  ; Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+; Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2
+  ADDS         r12, r12, #2
+  BLE xcorr_kernel_neon_process1
+  ; Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2
+  ; Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  ; instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+; Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1
+  ; Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1
+  ; y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr
+; Now process 1 last sample, not reading ahead.
+  ; Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  ; Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr
   ENDP
 
+; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+;  opus_val32 *xcorr, int len, int max_pitch)
 celt_pitch_xcorr_neon PROC
   ; input:
-  ; r0 = opus_val16 *_x
-  ; r1 = opus_val16 *_y
-  ; r2 = opus_val32 *xcorr
-  ; r3 = int        len
-
+  ;   r0  = opus_val16 *_x
+  ;   r1  = opus_val16 *_y
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
   ; output:
-  ; r0 = maxcorr
-
-  STMFD     sp!, {r4-r9, lr}
-
-  LDR       r4, [sp, #28]        ; r4 = int max_pitch
-  MOV       r5, r0               ; r5 = _x
-  MOV       r6, r1               ; r6 = _y
-  MOV       r7, r2               ; r7 = xcorr
-  MOV       r2, r3               ; r2 = len
-
-  VMOV.S32  d16, #1              ; d16 = {1, 1}  (not used by xcorr_kernel_neon)
-  MOV       r8, #0               ; r8 = i = 0
-  CMP       r4, #3               ; max_pitch-3 <= 0  ---> pitch_xcorr_neon_process4_done
-  BLE       celt_pitch_xcorr_neon_process4_done
-
-  SUB       r9, r4, #3           ; r9 = max_pitch-3
-
+  ;   r0  = int         maxcorr
+  ; internal usage:
+  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  ;   r6  = int         max_pitch
+  ;   r12 = int         j
+  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]
+  VMOV.S32     q15, #1
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
 celt_pitch_xcorr_neon_process4
-  MOV       r0, r5               ; r0 = _x
-  ADD       r1, r6 ,r8, LSL #1   ; r1 = _y + i
-  VMOV.I32  q0, #0               ; q0 = opus_val32 sum[4] = {0, 0, 0, 0}
-
-                                 ; xcorr_kernel_neon don't touch r2 (len)
-                                 ; So we don't store it
-  BL xcorr_kernel_neon           ; xcorr_kernel_neon(_x, _y+i, sum, len)
-
-  VST1.32   {q0}, [r7]!          ; Store sum to xcorr
-  VPMAX.S32 d0, d0, d1           ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0])
-  ADD       r8, r8, #4           ; i+=4
-  VPMAX.S32 d0, d0, d0           ; d0 = max(sum[3], sum[2], sum[1], sum[0])
-  CMP       r8, r9               ; i < max_pitch-3 ----> pitch_xcorr_neon_process4
-  VMAX.S32  d16, d16, d0         ; d16 = maxcorr = max(maxcorr, sum)
-
-  BLT       celt_pitch_xcorr_neon_process4
-
+  ; xcorr_kernel_neon parameters:
+  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+  ; So we don't save/restore any other registers.
+  BL xcorr_kernel_neon
+  SUBS         r6, r6, #4
+  VST1.32      {q0}, [r2]!
+  ; _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0
+  ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4 (else fall through)
+  BGE celt_pitch_xcorr_neon_process4
+; We have less than 4 sums left to compute.
 celt_pitch_xcorr_neon_process4_done
-  CMP       r8, r4;
-  BGE       celt_pitch_xcorr_neon_done
-
+  ADDS         r6, r6, #4
+  ; Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+; Now compute each remaining sum one at a time.
 celt_pitch_xcorr_neon_process_remaining
-  MOV       r0, r5               ; r0 = _x
-  ADD       r1, r6, r8, LSL #1   ; r1 = _y + i
-  VMOV.I32  q0, #0
-  MOVS      r3, r2, LSR #2       ; r3 = j = len
-  BEQ       inner_loop_neon_process4_done
-
-inner_loop_neon_process4
-  VLD1.16   {d2}, [r0]!          ; Load x
-  VLD1.16   {d3}, [r1]!          ; Load y
-  SUBS      r3, r3, #1
-  VMLAL.S16 q0, d2, d3
-  BNE       inner_loop_neon_process4
-
-  VPADD.S32 d0, d0, d1          ; Reduce sum
-  VPADD.S32 d0, d0, d0
-
-inner_loop_neon_process4_done
-  ANDS      r3, r2, #3
-  BEQ       inner_loop_neon_done
-
-inner_loop_neon_process_remaining
-  VLD1.16   {d2[]}, [r0]!
-  VLD1.16   {d3[]}, [r1]!
-  SUBS      r3, r3, #1
-  VMLAL.S16 q0, d2, d3
-  BNE       inner_loop_neon_process_remaining
-
-inner_loop_neon_done
-  VST1.32   {d0[0]}, [r7]!
-  VMAX.S32  d16, d16, d0
-
-  ADD       r8, r8, #1
-  CMP       r8, r4
-  BCC       celt_pitch_xcorr_neon_process_remaining
-
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+; Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8
+  ; Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  ; Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+; Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4
+  ADDS         r12, r12, #4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  ; Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  ; Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done
+  ; Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+; Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done
+  VST1.32      {d0[0]}, [r2]!
+  VMAX.S32     d30, d30, d0
+  SUBS         r6, r6, #1
+  ; _y++
+  ADD          r1, r1, #2
+  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
 celt_pitch_xcorr_neon_done
-  VMOV      d0, d16
-  VMOV.32   r0, d0[0]
-  LDMFD     sp!, {r4-r9, pc}
+  VMOV.32      r0, d30[0]
+  LDMFD        sp!, {r4-r6, pc}
   ENDP
-
 
 ENDIF