ref: df9275b31a5290b0c2abe42614e29b4072ed4ed0
parent: 821945d97c664b648ea645ed2af7c26a8fdc4c19
author: Jean-Marc Valin <[email protected]>
date: Thu Apr 10 10:38:14 EDT 2008
Not all compilers are equal -- making it clearer how the MDCT indexing is done
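
The pre-rotation and post-rotation loops now walk explicit 'restrict'
pointers instead of recomputing indices like in[N2+N4+2*i] on every
iteration: some compilers fail to strength-reduce the index form, and
without restrict none of them may assume the input and output buffers
never alias. A minimal sketch of the pattern (hypothetical
fold_idx/fold_ptr helpers over plain float, not the actual MDCT code):

    /* Index form: the compiler re-derives n-1-i and 2*i each time
       through and must assume in[] and out[] may overlap. */
    void fold_idx(const float *in, float *out, int n)
    {
       int i;
       for (i=0;i<n;i++)
          out[2*i] = in[n-1-i];
    }

    /* Pointer form: restrict promises no aliasing, and the strides
       become plain pointer increments. */
    void fold_ptr(const float *in, float *out, int n)
    {
       const float * restrict xp = in+n-1;
       float * restrict yp = out;
       int i;
       for (i=0;i<n;i++)
       {
          *yp = *xp--;
          yp += 2;
       }
    }

Both helpers compute the same thing; the second just spells out the
access pattern so the compiler can schedule the loads and stores freely.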
--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -99,37 +99,57 @@
/* Consider the input to be composed of four blocks: [a, b, c, d] */
/* Shuffle, fold, pre-rotate (part 1) */
- for(i=0;i<N/8;i++)
{
- kiss_fft_scalar re, im;
- /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
- re = -HALF32(in[N2+N4+2*i] + in[N2+N4-2*i-1]);
- im = -HALF32(in[N4+2*i] - in[N4-2*i-1]);
- /* We could remove the HALF32 above and just use MULT16_32_Q16 below
- (MIXED_PRECISION only) */
- out[2*i] = S_MUL(re,l->trig[i]) - S_MUL(im,l->trig[i+N4]);
- out[2*i+1] = S_MUL(im,l->trig[i]) + S_MUL(re,l->trig[i+N4]);
+ /* Temp pointers to make it really clear to the compiler what we're doing */
+ const kiss_fft_scalar * restrict xp1 = in+N4;
+ const kiss_fft_scalar * restrict xp2 = in+N2+N4-1;
+ kiss_fft_scalar * restrict yp = out;
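+   /* xp1 walks forward from the start of block b, xp2 walks backward
+      from the end of block c; the +/-N2 offsets below reach across
+      into blocks a and d. */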
+ for(i=0;i<N/8;i++)
+ {
+ kiss_fft_scalar re, im;
+       /* Real part arranged as -d-cR, Imag part arranged as -b+aR */
+ re = -HALF32(xp1[N2] + *xp2);
+ im = -HALF32(*xp1 - xp2[-N2]);
+ xp1+=2;
+ xp2-=2;
+ /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+ (MIXED_PRECISION only) */
+ *yp++ = S_MUL(re,l->trig[i]) - S_MUL(im,l->trig[i+N4]);
+ *yp++ = S_MUL(im,l->trig[i]) + S_MUL(re,l->trig[i+N4]);
+ }
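+    /* At i = N/8 both pointers cross a block boundary, so the folded
+       operands and their signs change for the rest of the range. */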
+ for(;i<N4;i++)
+ {
+ kiss_fft_scalar re, im;
+ /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+ re = HALF32(xp1[-N2] - *xp2);
+ im = -HALF32(*xp1 + xp2[N2]);
+ xp1+=2;
+ xp2-=2;
+ /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+ (MIXED_PRECISION only) */
+ *yp++ = S_MUL(re,l->trig[i]) - S_MUL(im,l->trig[i+N4]);
+ *yp++ = S_MUL(im,l->trig[i]) + S_MUL(re,l->trig[i+N4]);
+ }
}
- for(;i<N4;i++)
- {
- kiss_fft_scalar re, im;
- /* Real part arranged as a-bR, Imag part arranged as -c-dR */
- re = HALF32(in[2*i-N4] - in[N2+N4-2*i-1]);
- im = -HALF32(in[N4+2*i] + in[N+N4-2*i-1]);
- /* We could remove the HALF32 above and just use MULT16_32_Q16 below
- (MIXED_PRECISION only) */
- out[2*i] = S_MUL(re,l->trig[i]) - S_MUL(im,l->trig[i+N4]);
- out[2*i+1] = S_MUL(im,l->trig[i]) + S_MUL(re,l->trig[i+N4]);
- }
/* N/4 complex FFT, which should normally down-scale by 4/N (but doesn't now) */
cpx32_fft(l->kfft, out, f, N4);
/* Post-rotate and apply the scaling if the FFT doesn't do it itself */
- for(i=0;i<N4;i++)
{
- out[2*i] = -S_MUL(f[2*i+1],l->trig[i+N4]) + S_MUL(f[2*i] ,l->trig[i]);
- out[N2-1-2*i] = -S_MUL(f[2*i] ,l->trig[i+N4]) - S_MUL(f[2*i+1],l->trig[i]);
+ /* Temp pointers to make it really clear to the compiler what we're doing */
+ const kiss_fft_scalar * restrict fp = f;
+ kiss_fft_scalar * restrict yp1 = out;
+ kiss_fft_scalar * restrict yp2 = out+N2-1;
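+      /* yp1 fills the even-indexed outputs from the front while yp2
+         fills the odd-indexed outputs from the back. */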
+ for(i=0;i<N4;i++)
+ {
+ *yp1 = -S_MUL(fp[1],l->trig[i+N4]) + S_MUL(fp[0],l->trig[i]);
+ *yp2 = -S_MUL(fp[0],l->trig[i+N4]) - S_MUL(fp[1],l->trig[i]);
+ fp += 2;
+ yp1 += 2;
+ yp2 -= 2;
+ }
}
RESTORE_STACK;
}