shithub: opus

Download patch

ref: 447e5b8fc631ed40b7ad3715de2f8cfe26580960
parent: ef0d5f15c767a03a80bf6cea9c6e6e7cfe473168
author: Jean-Marc Valin <[email protected]>
date: Tue Apr 22 06:13:25 EDT 2008

Separated the pre-rotate from the window+shuffling and further simplified
everything.

--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -86,6 +86,13 @@
    celt_free(l->trig);
 }
 
+/* Only halve the value if float. In fixed-point, it's included in the shift */
+#ifdef FIXED_POINT
+#define FL_HALF(x) (x)
+#else
+#define FL_HALF(x) (.5f*(x))
+#endif
+
 void mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * restrict out, const celt_word16_t *window, int overlap)
 {
    int i;
@@ -98,65 +105,54 @@
    ALLOC(f, N2, kiss_fft_scalar);
    
    /* Consider the input to be compused of four blocks: [a, b, c, d] */
-   /* Shuffle, fold, pre-rotate (part 1) */
+   /* Window, shuffle, fold */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * restrict xp1 = in+overlap/2;
-      const kiss_fft_scalar * restrict xp2 = in+N2-1+overlap/2;
+      const kiss_fft_scalar * restrict xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * restrict xp2 = in+N2-1+(overlap>>1);
       kiss_fft_scalar * restrict yp = out;
-      kiss_fft_scalar *t = &l->trig[0];
-      const celt_word16_t * restrict wp1 = window+overlap/2;
-      const celt_word16_t * restrict wp2 = window+overlap/2-1;
-      for(i=0;i<overlap/4;i++)
+      const celt_word16_t * restrict wp1 = window+(overlap>>1);
+      const celt_word16_t * restrict wp2 = window+(overlap>>1)-1;
+      for(i=0;i<(overlap>>2);i++)
       {
-         kiss_fft_scalar re, im;
          /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
-         re = -(MULT16_32_Q16(*wp2, xp1[N2]) + MULT16_32_Q16(*wp1,*xp2));
-         im = -(MULT16_32_Q16(*wp1, *xp1)    - MULT16_32_Q16(*wp2, xp2[-N2]));
-#ifndef FIXED_POINT
-         re *= .5; im *= .5;
-#endif
+         *yp++ = -FL_HALF(MULT16_32_Q16(*wp2, xp1[N2]) + MULT16_32_Q16(*wp1,*xp2));
+         *yp++ = -FL_HALF(MULT16_32_Q16(*wp1, *xp1)    - MULT16_32_Q16(*wp2, xp2[-N2]));
          xp1+=2;
          xp2-=2;
          wp1+=2;
          wp2-=2;
-         /* We could remove the HALF32 above and just use MULT16_32_Q16 below
-         (MIXED_PRECISION only) */
-         *yp++ = S_MUL(re,t[0])  -  S_MUL(im,t[N4]);
-         *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);
-         t++;
       }
       wp1 = window;
       wp2 = window+overlap-1;
-      for(;i<N4-overlap/4;i++)
+      for(;i<N4-(overlap>>2);i++)
       {
-         kiss_fft_scalar re, im;
          /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         re = -HALF32(*xp2);
-         im = -HALF32(*xp1);
+         *yp++ = -HALF32(*xp2);
+         *yp++ = -HALF32(*xp1);
          xp1+=2;
          xp2-=2;
-         /* We could remove the HALF32 above and just use MULT16_32_Q16 below
-            (MIXED_PRECISION only) */
-         *yp++ = S_MUL(re,t[0])  -  S_MUL(im,t[N4]);
-         *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);
-         t++;
       }
       for(;i<N4;i++)
       {
-         kiss_fft_scalar re, im;
          /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         re =  (MULT16_32_Q16(*wp1, xp1[-N2]) - MULT16_32_Q16(*wp2, *xp2));
-         im = -(MULT16_32_Q16(*wp2, *xp1)     + MULT16_32_Q16(*wp1, xp2[N2]));
-#ifndef FIXED_POINT
-         re *= .5; im *= .5;
-#endif
+         *yp++ =  FL_HALF(MULT16_32_Q16(*wp1, xp1[-N2]) - MULT16_32_Q16(*wp2, *xp2));
+         *yp++ = -FL_HALF(MULT16_32_Q16(*wp2, *xp1)     + MULT16_32_Q16(*wp1, xp2[N2]));
          xp1+=2;
          xp2-=2;
          wp1+=2;
          wp2-=2;
-         /* We could remove the HALF32 above and just use MULT16_32_Q16 below
-         (MIXED_PRECISION only) */
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * restrict yp = out;
+      kiss_fft_scalar *t = &l->trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar re, im;
+         re = yp[0];
+         im = yp[1];
          *yp++ = S_MUL(re,t[0])  -  S_MUL(im,t[N4]);
          *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);
          t++;