shithub: opus

Download patch

ref: df9275b31a5290b0c2abe42614e29b4072ed4ed0
parent: 821945d97c664b648ea645ed2af7c26a8fdc4c19
author: Jean-Marc Valin <[email protected]>
date: Thu Apr 10 10:38:14 EDT 2008

Not all compilers are equal -- making it clearer how the MDCT indexing is done

--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -99,37 +99,57 @@
    
    /* Consider the input to be compused of four blocks: [a, b, c, d] */
    /* Shuffle, fold, pre-rotate (part 1) */
-   for(i=0;i<N/8;i++)
    {
-      kiss_fft_scalar re, im;
-      /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
-      re = -HALF32(in[N2+N4+2*i] + in[N2+N4-2*i-1]);
-      im = -HALF32(in[N4+2*i]    - in[N4-2*i-1]);
-      /* We could remove the HALF32 above and just use MULT16_32_Q16 below
-         (MIXED_PRECISION only) */
-      out[2*i]   = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
-      out[2*i+1] = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * restrict xp1 = in+N4;
+      const kiss_fft_scalar * restrict xp2 = in+N2+N4-1;
+      kiss_fft_scalar * restrict yp = out;
+      for(i=0;i<N/8;i++)
+      {
+         kiss_fft_scalar re, im;
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         re = -HALF32(xp1[N2] + *xp2);
+         im = -HALF32(*xp1    - xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+            (MIXED_PRECISION only) */
+         *yp++ = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
+         *yp++ = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
+      }
+      for(;i<N4;i++)
+      {
+         kiss_fft_scalar re, im;
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         re =  HALF32(xp1[-N2] - *xp2);
+         im = -HALF32(*xp1 + xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         /* We could remove the HALF32 above and just use MULT16_32_Q16 below
+            (MIXED_PRECISION only) */
+         *yp++ = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
+         *yp++ = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
+      }
    }
-   for(;i<N4;i++)
-   {
-      kiss_fft_scalar re, im;
-      /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-      re =  HALF32(in[2*i-N4] - in[N2+N4-2*i-1]);
-      im = -HALF32(in[N4+2*i] + in[N+N4-2*i-1]);
-      /* We could remove the HALF32 above and just use MULT16_32_Q16 below
-         (MIXED_PRECISION only) */
-      out[2*i]   = S_MUL(re,l->trig[i])  -  S_MUL(im,l->trig[i+N4]);
-      out[2*i+1] = S_MUL(im,l->trig[i])  +  S_MUL(re,l->trig[i+N4]);
-   }
 
    /* N/4 complex FFT, which should normally down-scale by 4/N (but doesn't now) */
    cpx32_fft(l->kfft, out, f, N4);
 
    /* Post-rotate and apply the scaling if the FFT doesn't to it itself */
-   for(i=0;i<N4;i++)
    {
-      out[2*i]      = -S_MUL(f[2*i+1],l->trig[i+N4]) + S_MUL(f[2*i]  ,l->trig[i]);
-      out[N2-1-2*i] = -S_MUL(f[2*i]  ,l->trig[i+N4]) - S_MUL(f[2*i+1],l->trig[i]);
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * restrict fp = f;
+      kiss_fft_scalar * restrict yp1 = out;
+      kiss_fft_scalar * restrict yp2 = out+N2-1;
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      for(i=0;i<N4;i++)
+      {
+         *yp1 = -S_MUL(fp[1],l->trig[i+N4]) + S_MUL(fp[0],l->trig[i]);
+         *yp2 = -S_MUL(fp[0],l->trig[i+N4]) - S_MUL(fp[1],l->trig[i]);
+         fp += 2;
+         yp1 += 2;
+         yp2 -= 2;
+      }
    }
    RESTORE_STACK;
 }