shithub: opus

Download patch

ref: 6211c90def21b649e7c30e75ea1ea07e2a4c8483
parent: d7dfb00886d88e82614418c21a582101a0517622
author: Jean-Marc Valin <[email protected]>
date: Fri Feb 8 09:21:20 EST 2008

Split the radix functions into forward and backward versions, removed the
"inverse" flag from the state so it can be shared between the forward and
inverse transforms.

--- a/libcelt/_kiss_fft_guts.h
+++ b/libcelt/_kiss_fft_guts.h
@@ -29,7 +29,6 @@
 
 struct kiss_fft_state{
     int nfft;
-    int inverse;
     int factors[2*MAXFACTORS];
     int *bitrev;
     kiss_fft_cpx twiddles[1];
--- a/libcelt/kiss_fft.c
+++ b/libcelt/kiss_fft.c
@@ -1,6 +1,8 @@
 /*
 Copyright (c) 2003-2004, Mark Borgerding
+Lots of modifications by JMV
 Copyright (c) 2005-2007, Jean-Marc Valin
+Copyright (c) 2008,      Jean-Marc Valin, CSIRO
 
 All rights reserved.
 
@@ -27,372 +29,457 @@
  */
 
 static void kf_bfly2(
-        kiss_fft_cpx * Fout,
-        const size_t fstride,
-        const kiss_fft_cfg st,
-        int m,
-        int N,
-        int mm
-        )
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m,
+                     int N,
+                     int mm
+                    )
 {
-    kiss_fft_cpx * Fout2;
-    kiss_fft_cpx * tw1;
-    kiss_fft_cpx t;
-    if (!st->inverse) {
-       int i,j;
-       kiss_fft_cpx * Fout_beg = Fout;
-       for (i=0;i<N;i++)
-       {
-          Fout = Fout_beg + i*mm;
-          Fout2 = Fout + m;
-          tw1 = st->twiddles;
-          for(j=0;j<m;j++)
-          {
+   kiss_fft_cpx * Fout2;
+   kiss_fft_cpx * tw1;
+   kiss_fft_cpx t;
+   int i,j;
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout2 = Fout + m;
+      tw1 = st->twiddles;
+      for(j=0;j<m;j++)
+      {
              /* Almost the same as the code path below, except that we divide the input by two
-              (while keeping the best accuracy possible) */
-             celt_word32_t tr, ti;
-             tr = SHR32(SUB32(MULT16_16(Fout2->r , tw1->r),MULT16_16(Fout2->i , tw1->i)), 1);
-             ti = SHR32(ADD32(MULT16_16(Fout2->i , tw1->r),MULT16_16(Fout2->r , tw1->i)), 1);
-             tw1 += fstride;
-             Fout2->r = PSHR32(SUB32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
-             Fout2->i = PSHR32(SUB32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
-             Fout->r = PSHR32(ADD32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
-             Fout->i = PSHR32(ADD32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
-             ++Fout2;
-             ++Fout;
-          }
-       }
-    } else {
-       int i,j;
-       kiss_fft_cpx * Fout_beg = Fout;
-       for (i=0;i<N;i++)
-       {
-          Fout = Fout_beg + i*mm;
-          Fout2 = Fout + m;
-          tw1 = st->twiddles;
-          for(j=0;j<m;j++)
-          {
-             C_MULC (t,  *Fout2 , *tw1);
-             tw1 += fstride;
-             C_SUB( *Fout2 ,  *Fout , t );
-             C_ADDTO( *Fout ,  t );
-             ++Fout2;
-             ++Fout;
-          }
-       }
-    }
+         (while keeping the best accuracy possible) */
+         celt_word32_t tr, ti;
+         tr = SHR32(SUB32(MULT16_16(Fout2->r , tw1->r),MULT16_16(Fout2->i , tw1->i)), 1);
+         ti = SHR32(ADD32(MULT16_16(Fout2->i , tw1->r),MULT16_16(Fout2->r , tw1->i)), 1);
+         tw1 += fstride;
+         Fout2->r = PSHR32(SUB32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
+         Fout2->i = PSHR32(SUB32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
+         Fout->r = PSHR32(ADD32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
+         Fout->i = PSHR32(ADD32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
+         ++Fout2;
+         ++Fout;
+      }
+   }
 }
 
+static void ki_bfly2(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx * Fout2;
+   kiss_fft_cpx * tw1;
+   kiss_fft_cpx t;
+   int i,j;
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout2 = Fout + m;
+      tw1 = st->twiddles;
+      for(j=0;j<m;j++)
+      {
+         C_MULC (t,  *Fout2 , *tw1);
+         tw1 += fstride;
+         C_SUB( *Fout2 ,  *Fout , t );
+         C_ADDTO( *Fout ,  t );
+         ++Fout2;
+         ++Fout;
+      }
+   }
+}
+
 static void kf_bfly4(
-        kiss_fft_cpx * Fout,
-        const size_t fstride,
-        const kiss_fft_cfg st,
-        int m,
-        int N,
-        int mm
-        )
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m,
+                     int N,
+                     int mm
+                    )
 {
-    kiss_fft_cpx *tw1,*tw2,*tw3;
-    kiss_fft_cpx scratch[6];
-    const size_t m2=2*m;
-    const size_t m3=3*m;
-    int i, j;
+   kiss_fft_cpx *tw1,*tw2,*tw3;
+   kiss_fft_cpx scratch[6];
+   const size_t m2=2*m;
+   const size_t m3=3*m;
+   int i, j;
 
-    if (st->inverse)
-    {
-       kiss_fft_cpx * Fout_beg = Fout;
-       for (i=0;i<N;i++)
-       {
-          Fout = Fout_beg + i*mm;
-          tw3 = tw2 = tw1 = st->twiddles;
-          for (j=0;j<m;j++)
-          {
-             C_MULC(scratch[0],Fout[m] , *tw1 );
-             C_MULC(scratch[1],Fout[m2] , *tw2 );
-             C_MULC(scratch[2],Fout[m3] , *tw3 );
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw3 = tw2 = tw1 = st->twiddles;
+      for (j=0;j<m;j++)
+      {
+         C_MUL4(scratch[0],Fout[m] , *tw1 );
+         C_MUL4(scratch[1],Fout[m2] , *tw2 );
+         C_MUL4(scratch[2],Fout[m3] , *tw3 );
              
-             C_SUB( scratch[5] , *Fout, scratch[1] );
-             C_ADDTO(*Fout, scratch[1]);
-             C_ADD( scratch[3] , scratch[0] , scratch[2] );
-             C_SUB( scratch[4] , scratch[0] , scratch[2] );
-             C_SUB( Fout[m2], *Fout, scratch[3] );
-             tw1 += fstride;
-             tw2 += fstride*2;
-             tw3 += fstride*3;
-             C_ADDTO( *Fout , scratch[3] );
+         Fout->r = PSHR16(Fout->r, 2);
+         Fout->i = PSHR16(Fout->i, 2);
+         C_SUB( scratch[5] , *Fout, scratch[1] );
+         C_ADDTO(*Fout, scratch[1]);
+         C_ADD( scratch[3] , scratch[0] , scratch[2] );
+         C_SUB( scratch[4] , scratch[0] , scratch[2] );
+         Fout[m2].r = PSHR16(Fout[m2].r, 2);
+         Fout[m2].i = PSHR16(Fout[m2].i, 2);
+         C_SUB( Fout[m2], *Fout, scratch[3] );
+         tw1 += fstride;
+         tw2 += fstride*2;
+         tw3 += fstride*3;
+         C_ADDTO( *Fout , scratch[3] );
              
-             Fout[m].r = scratch[5].r - scratch[4].i;
-             Fout[m].i = scratch[5].i + scratch[4].r;
-             Fout[m3].r = scratch[5].r + scratch[4].i;
-             Fout[m3].i = scratch[5].i - scratch[4].r;
-             ++Fout;
-          }
-       }
-    } else
-    {
-       kiss_fft_cpx * Fout_beg = Fout;
-       for (i=0;i<N;i++)
-       {
-          Fout = Fout_beg + i*mm;
-          tw3 = tw2 = tw1 = st->twiddles;
-          for (j=0;j<m;j++)
-          {
-             C_MUL4(scratch[0],Fout[m] , *tw1 );
-             C_MUL4(scratch[1],Fout[m2] , *tw2 );
-             C_MUL4(scratch[2],Fout[m3] , *tw3 );
+         Fout[m].r = scratch[5].r + scratch[4].i;
+         Fout[m].i = scratch[5].i - scratch[4].r;
+         Fout[m3].r = scratch[5].r - scratch[4].i;
+         Fout[m3].i = scratch[5].i + scratch[4].r;
+         ++Fout;
+      }
+   }
+}
+
+static void ki_bfly4(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx *tw1,*tw2,*tw3;
+   kiss_fft_cpx scratch[6];
+   const size_t m2=2*m;
+   const size_t m3=3*m;
+   int i, j;
+
+   kiss_fft_cpx * Fout_beg = Fout;
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      tw3 = tw2 = tw1 = st->twiddles;
+      for (j=0;j<m;j++)
+      {
+         C_MULC(scratch[0],Fout[m] , *tw1 );
+         C_MULC(scratch[1],Fout[m2] , *tw2 );
+         C_MULC(scratch[2],Fout[m3] , *tw3 );
              
-             Fout->r = PSHR16(Fout->r, 2);
-             Fout->i = PSHR16(Fout->i, 2);
-             C_SUB( scratch[5] , *Fout, scratch[1] );
-             C_ADDTO(*Fout, scratch[1]);
-             C_ADD( scratch[3] , scratch[0] , scratch[2] );
-             C_SUB( scratch[4] , scratch[0] , scratch[2] );
-             Fout[m2].r = PSHR16(Fout[m2].r, 2);
-             Fout[m2].i = PSHR16(Fout[m2].i, 2);
-             C_SUB( Fout[m2], *Fout, scratch[3] );
-             tw1 += fstride;
-             tw2 += fstride*2;
-             tw3 += fstride*3;
-             C_ADDTO( *Fout , scratch[3] );
+         C_SUB( scratch[5] , *Fout, scratch[1] );
+         C_ADDTO(*Fout, scratch[1]);
+         C_ADD( scratch[3] , scratch[0] , scratch[2] );
+         C_SUB( scratch[4] , scratch[0] , scratch[2] );
+         C_SUB( Fout[m2], *Fout, scratch[3] );
+         tw1 += fstride;
+         tw2 += fstride*2;
+         tw3 += fstride*3;
+         C_ADDTO( *Fout , scratch[3] );
              
-             Fout[m].r = scratch[5].r + scratch[4].i;
-             Fout[m].i = scratch[5].i - scratch[4].r;
-             Fout[m3].r = scratch[5].r - scratch[4].i;
-             Fout[m3].i = scratch[5].i + scratch[4].r;
-             ++Fout;
-          }
-       }
-    }
+         Fout[m].r = scratch[5].r - scratch[4].i;
+         Fout[m].i = scratch[5].i + scratch[4].r;
+         Fout[m3].r = scratch[5].r + scratch[4].i;
+         Fout[m3].i = scratch[5].i - scratch[4].r;
+         ++Fout;
+      }
+   }
 }
 
+
 static void kf_bfly3(
-         kiss_fft_cpx * Fout,
-         const size_t fstride,
-         const kiss_fft_cfg st,
-         size_t m
-         )
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     size_t m
+                    )
 {
-     size_t k=m;
-     const size_t m2 = 2*m;
-     kiss_fft_cpx *tw1,*tw2;
-     kiss_fft_cpx scratch[5];
-     kiss_fft_cpx epi3;
-     epi3 = st->twiddles[fstride*m];
+   size_t k=m;
+   const size_t m2 = 2*m;
+   kiss_fft_cpx *tw1,*tw2;
+   kiss_fft_cpx scratch[5];
+   kiss_fft_cpx epi3;
+   epi3 = st->twiddles[fstride*m];
 
-     tw1=tw2=st->twiddles;
-     if (st->inverse) {
-        do{
-           if (!st->inverse) {
-              C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
-           }
+   tw1=tw2=st->twiddles;
+   do{
+      C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
 
-           C_MULC(scratch[1],Fout[m] , *tw1);
-           C_MULC(scratch[2],Fout[m2] , *tw2);
+      C_MUL(scratch[1],Fout[m] , *tw1);
+      C_MUL(scratch[2],Fout[m2] , *tw2);
 
-           C_ADD(scratch[3],scratch[1],scratch[2]);
-           C_SUB(scratch[0],scratch[1],scratch[2]);
-           tw1 += fstride;
-           tw2 += fstride*2;
+      C_ADD(scratch[3],scratch[1],scratch[2]);
+      C_SUB(scratch[0],scratch[1],scratch[2]);
+      tw1 += fstride;
+      tw2 += fstride*2;
 
-           Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
-           Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+      Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+      Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
 
-           C_MULBYSCALAR( scratch[0] , -epi3.i );
+      C_MULBYSCALAR( scratch[0] , epi3.i );
 
-           C_ADDTO(*Fout,scratch[3]);
+      C_ADDTO(*Fout,scratch[3]);
 
-           Fout[m2].r = Fout[m].r + scratch[0].i;
-           Fout[m2].i = Fout[m].i - scratch[0].r;
+      Fout[m2].r = Fout[m].r + scratch[0].i;
+      Fout[m2].i = Fout[m].i - scratch[0].r;
 
-           Fout[m].r -= scratch[0].i;
-           Fout[m].i += scratch[0].r;
+      Fout[m].r -= scratch[0].i;
+      Fout[m].i += scratch[0].r;
 
-           ++Fout;
-        }while(--k);
-     } else {
-        do{
-           if (!st->inverse) {
-              C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
-           }
+      ++Fout;
+   }while(--k);
+}
 
-           C_MUL(scratch[1],Fout[m] , *tw1);
-           C_MUL(scratch[2],Fout[m2] , *tw2);
+static void ki_bfly3(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     size_t m
+                    )
+{
+   size_t k=m;
+   const size_t m2 = 2*m;
+   kiss_fft_cpx *tw1,*tw2;
+   kiss_fft_cpx scratch[5];
+   kiss_fft_cpx epi3;
+   epi3 = st->twiddles[fstride*m];
 
-           C_ADD(scratch[3],scratch[1],scratch[2]);
-           C_SUB(scratch[0],scratch[1],scratch[2]);
-           tw1 += fstride;
-           tw2 += fstride*2;
+   tw1=tw2=st->twiddles;
+   do{
 
-           Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
-           Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
+      C_MULC(scratch[1],Fout[m] , *tw1);
+      C_MULC(scratch[2],Fout[m2] , *tw2);
 
-           C_MULBYSCALAR( scratch[0] , epi3.i );
+      C_ADD(scratch[3],scratch[1],scratch[2]);
+      C_SUB(scratch[0],scratch[1],scratch[2]);
+      tw1 += fstride;
+      tw2 += fstride*2;
 
-           C_ADDTO(*Fout,scratch[3]);
+      Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
+      Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
 
-           Fout[m2].r = Fout[m].r + scratch[0].i;
-           Fout[m2].i = Fout[m].i - scratch[0].r;
+      C_MULBYSCALAR( scratch[0] , -epi3.i );
 
-           Fout[m].r -= scratch[0].i;
-           Fout[m].i += scratch[0].r;
+      C_ADDTO(*Fout,scratch[3]);
 
-           ++Fout;
-        }while(--k);
-     }
+      Fout[m2].r = Fout[m].r + scratch[0].i;
+      Fout[m2].i = Fout[m].i - scratch[0].r;
+
+      Fout[m].r -= scratch[0].i;
+      Fout[m].i += scratch[0].r;
+
+      ++Fout;
+   }while(--k);
 }
 
+
 static void kf_bfly5(
-        kiss_fft_cpx * Fout,
-        const size_t fstride,
-        const kiss_fft_cfg st,
-        int m
-        )
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m
+                    )
 {
-    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-    int u;
-    kiss_fft_cpx scratch[13];
-    kiss_fft_cpx * twiddles = st->twiddles;
-    kiss_fft_cpx *tw;
-    kiss_fft_cpx ya,yb;
-    ya = twiddles[fstride*m];
-    yb = twiddles[fstride*2*m];
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int u;
+   kiss_fft_cpx scratch[13];
+   kiss_fft_cpx * twiddles = st->twiddles;
+   kiss_fft_cpx *tw;
+   kiss_fft_cpx ya,yb;
+   ya = twiddles[fstride*m];
+   yb = twiddles[fstride*2*m];
 
-    Fout0=Fout;
-    Fout1=Fout0+m;
-    Fout2=Fout0+2*m;
-    Fout3=Fout0+3*m;
-    Fout4=Fout0+4*m;
+   Fout0=Fout;
+   Fout1=Fout0+m;
+   Fout2=Fout0+2*m;
+   Fout3=Fout0+3*m;
+   Fout4=Fout0+4*m;
 
-    tw=st->twiddles;
-    if (st->inverse) {
+   tw=st->twiddles;
+   for ( u=0; u<m; ++u ) {
+      C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
+      scratch[0] = *Fout0;
 
-       for ( u=0; u<m; ++u ) {
-          if (!st->inverse) {
-             C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
-          }
-          scratch[0] = *Fout0;
+      C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+      C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+      C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+      C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
 
-          C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
-          C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
-          C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
-          C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
+      C_ADD( scratch[7],scratch[1],scratch[4]);
+      C_SUB( scratch[10],scratch[1],scratch[4]);
+      C_ADD( scratch[8],scratch[2],scratch[3]);
+      C_SUB( scratch[9],scratch[2],scratch[3]);
 
-          C_ADD( scratch[7],scratch[1],scratch[4]);
-          C_SUB( scratch[10],scratch[1],scratch[4]);
-          C_ADD( scratch[8],scratch[2],scratch[3]);
-          C_SUB( scratch[9],scratch[2],scratch[3]);
+      Fout0->r += scratch[7].r + scratch[8].r;
+      Fout0->i += scratch[7].i + scratch[8].i;
 
-          Fout0->r += scratch[7].r + scratch[8].r;
-          Fout0->i += scratch[7].i + scratch[8].i;
+      scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+      scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
 
-          scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
-          scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+      scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
+      scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
 
-          scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
-          scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
+      C_SUB(*Fout1,scratch[5],scratch[6]);
+      C_ADD(*Fout4,scratch[5],scratch[6]);
 
-          C_SUB(*Fout1,scratch[5],scratch[6]);
-          C_ADD(*Fout4,scratch[5],scratch[6]);
+      scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+      scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+      scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
+      scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
 
-          scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
-          scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
-          scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
-          scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
+      C_ADD(*Fout2,scratch[11],scratch[12]);
+      C_SUB(*Fout3,scratch[11],scratch[12]);
 
-          C_ADD(*Fout2,scratch[11],scratch[12]);
-          C_SUB(*Fout3,scratch[11],scratch[12]);
+      ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+   }
+}
 
-          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-       }
-    } else {
-       for ( u=0; u<m; ++u ) {
-          if (!st->inverse) {
-             C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
-          }
-          scratch[0] = *Fout0;
+static void ki_bfly5(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_cfg st,
+                     int m
+                    )
+{
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int u;
+   kiss_fft_cpx scratch[13];
+   kiss_fft_cpx * twiddles = st->twiddles;
+   kiss_fft_cpx *tw;
+   kiss_fft_cpx ya,yb;
+   ya = twiddles[fstride*m];
+   yb = twiddles[fstride*2*m];
 
-          C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
-          C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
-          C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
-          C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+   Fout0=Fout;
+   Fout1=Fout0+m;
+   Fout2=Fout0+2*m;
+   Fout3=Fout0+3*m;
+   Fout4=Fout0+4*m;
 
-          C_ADD( scratch[7],scratch[1],scratch[4]);
-          C_SUB( scratch[10],scratch[1],scratch[4]);
-          C_ADD( scratch[8],scratch[2],scratch[3]);
-          C_SUB( scratch[9],scratch[2],scratch[3]);
+   tw=st->twiddles;
+   for ( u=0; u<m; ++u ) {
+      scratch[0] = *Fout0;
 
-          Fout0->r += scratch[7].r + scratch[8].r;
-          Fout0->i += scratch[7].i + scratch[8].i;
+      C_MULC(scratch[1] ,*Fout1, tw[u*fstride]);
+      C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]);
+      C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]);
+      C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]);
 
-          scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
-          scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
+      C_ADD( scratch[7],scratch[1],scratch[4]);
+      C_SUB( scratch[10],scratch[1],scratch[4]);
+      C_ADD( scratch[8],scratch[2],scratch[3]);
+      C_SUB( scratch[9],scratch[2],scratch[3]);
 
-          scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
-          scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
+      Fout0->r += scratch[7].r + scratch[8].r;
+      Fout0->i += scratch[7].i + scratch[8].i;
 
-          C_SUB(*Fout1,scratch[5],scratch[6]);
-          C_ADD(*Fout4,scratch[5],scratch[6]);
+      scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
+      scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
 
-          scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
-          scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
-          scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
-          scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
+      scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i);
+      scratch[6].i =  S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i);
 
-          C_ADD(*Fout2,scratch[11],scratch[12]);
-          C_SUB(*Fout3,scratch[11],scratch[12]);
+      C_SUB(*Fout1,scratch[5],scratch[6]);
+      C_ADD(*Fout4,scratch[5],scratch[6]);
 
-          ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-       }
-    }
+      scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
+      scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
+      scratch[12].r =  S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i);
+      scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i);
+
+      C_ADD(*Fout2,scratch[11],scratch[12]);
+      C_SUB(*Fout3,scratch[11],scratch[12]);
+
+      ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+   }
 }
 
 /* perform the butterfly for one stage of a mixed radix FFT */
 static void kf_bfly_generic(
-        kiss_fft_cpx * Fout,
-        const size_t fstride,
-        const kiss_fft_cfg st,
-        int m,
-        int p
-        )
+                            kiss_fft_cpx * Fout,
+                            const size_t fstride,
+                            const kiss_fft_cfg st,
+                            int m,
+                            int p
+                           )
 {
-    int u,k,q1,q;
-    kiss_fft_cpx * twiddles = st->twiddles;
-    kiss_fft_cpx t;
-    kiss_fft_cpx scratchbuf[17];
-    int Norig = st->nfft;
+   int u,k,q1,q;
+   kiss_fft_cpx * twiddles = st->twiddles;
+   kiss_fft_cpx t;
+   kiss_fft_cpx scratchbuf[17];
+   int Norig = st->nfft;
 
-    /*CHECKBUF(scratchbuf,nscratchbuf,p);*/
-    if (p>17)
-       celt_fatal("KissFFT: max radix supported is 17");
+   /*CHECKBUF(scratchbuf,nscratchbuf,p);*/
+   if (p>17)
+      celt_fatal("KissFFT: max radix supported is 17");
     
-    for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-            scratchbuf[q1] = Fout[ k  ];
-        if (!st->inverse) {
-            C_FIXDIV(scratchbuf[q1],p);
-	}
-            k += m;
-        }
+   for ( u=0; u<m; ++u ) {
+      k=u;
+      for ( q1=0 ; q1<p ; ++q1 ) {
+         scratchbuf[q1] = Fout[ k  ];
+         C_FIXDIV(scratchbuf[q1],p);
+         k += m;
+      }
 
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-            int twidx=0;
-            Fout[ k ] = scratchbuf[0];
-            for (q=1;q<p;++q ) {
-                twidx += fstride * k;
-                if (twidx>=Norig) twidx-=Norig;
-                if (st->inverse)
-                   C_MULC(t,scratchbuf[q] , twiddles[twidx] );
-                else
-                   C_MUL(t,scratchbuf[q] , twiddles[twidx] );
-                C_ADDTO( Fout[ k ] ,t);
-            }
-            k += m;
-        }
-    }
+      k=u;
+      for ( q1=0 ; q1<p ; ++q1 ) {
+         int twidx=0;
+         Fout[ k ] = scratchbuf[0];
+         for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            C_MUL(t,scratchbuf[q] , twiddles[twidx] );
+            C_ADDTO( Fout[ k ] ,t);
+         }
+         k += m;
+      }
+   }
 }
 
+static void ki_bfly_generic(
+                            kiss_fft_cpx * Fout,
+                            const size_t fstride,
+                            const kiss_fft_cfg st,
+                            int m,
+                            int p
+                           )
+{
+   int u,k,q1,q;
+   kiss_fft_cpx * twiddles = st->twiddles;
+   kiss_fft_cpx t;
+   kiss_fft_cpx scratchbuf[17];
+   int Norig = st->nfft;
+
+   /*CHECKBUF(scratchbuf,nscratchbuf,p);*/
+   if (p>17)
+      celt_fatal("KissFFT: max radix supported is 17");
+    
+   for ( u=0; u<m; ++u ) {
+      k=u;
+      for ( q1=0 ; q1<p ; ++q1 ) {
+         scratchbuf[q1] = Fout[ k  ];
+         k += m;
+      }
+
+      k=u;
+      for ( q1=0 ; q1<p ; ++q1 ) {
+         int twidx=0;
+         Fout[ k ] = scratchbuf[0];
+         for (q=1;q<p;++q ) {
+            twidx += fstride * k;
+            if (twidx>=Norig) twidx-=Norig;
+            C_MULC(t,scratchbuf[q] , twiddles[twidx] );
+            C_ADDTO( Fout[ k ] ,t);
+         }
+         k += m;
+      }
+   }
+}
+
 static
 void compute_bitrev_table(
          int * Fout,
@@ -456,6 +543,36 @@
     }    
 }
 
+static
+      void ki_work(
+                   kiss_fft_cpx * Fout,
+                   const kiss_fft_cpx * f,
+                   const size_t fstride,
+                   int in_stride,
+                   int * factors,
+                   const kiss_fft_cfg st,
+                   int N,
+                   int s2,
+                   int m2
+                  )
+{
+   int i;
+   kiss_fft_cpx * Fout_beg=Fout;
+   const int p=*factors++; /* the radix  */
+   const int m=*factors++; /* stage's fft length/p */
+   /*printf ("fft %d %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N, m2);*/
+   if (m!=1) 
+      ki_work( Fout , f, fstride*p, in_stride, factors,st, N*p, fstride*in_stride, m);
+
+   switch (p) {
+      case 2: ki_bfly2(Fout,fstride,st,m, N, m2); break;
+      case 3: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; ki_bfly3(Fout,fstride,st,m);} break; 
+      case 4: ki_bfly4(Fout,fstride,st,m, N, m2); break;
+      case 5: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; ki_bfly5(Fout,fstride,st,m);} break; 
+      default: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; ki_bfly_generic(Fout,fstride,st,m,p);} break;
+   }    
+}
+
 /*  facbuf is populated by p1,m1,p2,m2, ...
     where 
     p[i] * m[i] = m[i-1]
@@ -488,7 +605,7 @@
  * The return value is a contiguous block of memory, allocated with malloc.  As such,
  * It can be freed with free(), rather than a kiss_fft-specific function.
  * */
-kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
+kiss_fft_cfg kiss_fft_alloc(int nfft,void * mem,size_t * lenmem )
 {
     kiss_fft_cfg st=NULL;
     size_t memneeded = sizeof(struct kiss_fft_state)
@@ -504,10 +621,9 @@
     if (st) {
         int i;
         st->nfft=nfft;
-        st->inverse = inverse_fft;
 #ifdef FIXED_POINT
         for (i=0;i<nfft;++i) {
-            celt_word32_t phase = i;
+            celt_word32_t phase = -i;
             kf_cexp2(st->twiddles+i, DIV32(SHL32(phase,17),nfft));
         }
 #else
@@ -546,5 +662,24 @@
 void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
 {
     kiss_fft_stride(cfg,fin,fout,1);
+}
+
+void kiss_ifft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
+{
+   if (fin == fout) 
+   {
+      celt_fatal("In-place FFT not supported");
+   } else {
+      /* Bit-reverse the input */
+      int i;
+      for (i=0;i<st->nfft;i++)
+         fout[i] = fin[st->bitrev[i]];
+      ki_work( fout, fin, 1,in_stride, st->factors,st, 1, in_stride, 1);
+   }
+}
+
+void kiss_ifft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
+{
+   kiss_ifft_stride(cfg,fin,fout,1);
 }
 
--- a/libcelt/kiss_fft.h
+++ b/libcelt/kiss_fft.h
@@ -71,7 +71,7 @@
  *      buffer size in *lenmem.
  * */
 
-kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem); 
+kiss_fft_cfg kiss_fft_alloc(int nfft,void * mem,size_t * lenmem); 
 
 /*
  * kiss_fft(cfg,in_out_buf)
@@ -84,11 +84,13 @@
     f[k].r and f[k].i
  * */
 void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
+void kiss_ifft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
 
 /*
  A more generic version of the above function. It reads its input from every Nth sample.
  * */
 void kiss_fft_stride(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int fin_stride);
+void kiss_ifft_stride(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int fin_stride);
 
 /* If kiss_fft_alloc allocated a buffer, it is one contiguous 
    buffer and can be simply free()d when no longer needed*/
--- a/libcelt/kiss_fftr.c
+++ b/libcelt/kiss_fftr.c
@@ -33,7 +33,7 @@
 #endif    
 };
 
-kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,void * mem,size_t * lenmem)
 {
     int i;
     kiss_fftr_cfg st = NULL;
@@ -45,7 +45,7 @@
     }
     nfft >>= 1;
 
-    kiss_fft_alloc (nfft, inverse_fft, NULL, &subsize);
+    kiss_fft_alloc (nfft, NULL, &subsize);
     memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
 
     if (lenmem == NULL) {
@@ -61,7 +61,7 @@
     st->substate = (kiss_fft_cfg) (st + 1); /*just beyond kiss_fftr_state struct */
     st->tmpbuf = (kiss_fft_cpx *) (((char *) st->substate) + subsize);
     st->super_twiddles = st->tmpbuf + nfft;
-    kiss_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
+    kiss_fft_alloc(nfft, st->substate, &subsize);
 
 #ifdef FIXED_POINT
     for (i=0;i<nfft;++i) {
@@ -85,10 +85,6 @@
    kiss_fft_cpx f2k,tdc;
    celt_word32_t f1kr, f1ki, twr, twi;
 
-   if ( st->substate->inverse) {
-      celt_fatal("kiss fft usage error: improper alloc\n");
-   }
-
    ncfft = st->substate->nfft;
 
    /*perform the parallel fft of two real signals packed in real,imag*/
@@ -142,10 +138,6 @@
    /* input buffer timedata is stored row-wise */
    int k, ncfft;
 
-   if (st->substate->inverse == 0) {
-      celt_fatal ("kiss fft usage error: improper alloc\n");
-   }
-
    ncfft = st->substate->nfft;
 
    st->tmpbuf[0].r = freqdata[0] + freqdata[1];
@@ -169,5 +161,5 @@
       st->tmpbuf[ncfft - k].i *= -1;
 #endif
    }
-   kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
+   kiss_ifft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
 }
--- a/libcelt/kiss_fftr.h
+++ b/libcelt/kiss_fftr.h
@@ -18,7 +18,7 @@
 typedef struct kiss_fftr_state *kiss_fftr_cfg;
 
 
-kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,void * mem, size_t * lenmem);
 /*
  nfft must be even
 
--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -54,8 +54,7 @@
    l->n = N;
    N2 = N/2;
    N4 = N/4;
-   l->kfft = kiss_fft_alloc(N4, 0, NULL, NULL);
-   l->ikfft = kiss_fft_alloc(N4, 1, NULL, NULL);
+   l->kfft = kiss_fft_alloc(N4, NULL, NULL);
    l->trig = celt_alloc(N2*sizeof(float));
    /* We have enough points that sine isn't necessary */
    for (i=0;i<N2;i++)
@@ -66,7 +65,6 @@
 void mdct_clear(mdct_lookup *l)
 {
    kiss_fft_free(l->kfft);
-   kiss_fft_free(l->ikfft);
    celt_free(l->trig);
 }
 
@@ -131,7 +129,7 @@
    }
 
    /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
-   kiss_fft(l->ikfft, (const kiss_fft_cpx *)out, (kiss_fft_cpx *)f);
+   kiss_ifft(l->kfft, (const kiss_fft_cpx *)out, (kiss_fft_cpx *)f);
    
    /* Post-rotate */
    for(i=0;i<N4;i++)
--- a/libcelt/mdct.h
+++ b/libcelt/mdct.h
@@ -47,7 +47,6 @@
 typedef struct {
    int n;
    kiss_fft_cfg kfft;
-   kiss_fft_cfg ikfft;
    float *trig;
    float scale;
 } mdct_lookup;
--- a/tests/dft-test.c
+++ b/tests/dft-test.c
@@ -42,7 +42,7 @@
 
     kiss_fft_cpx  * in = (kiss_fft_cpx*)malloc(buflen);
     kiss_fft_cpx  * out= (kiss_fft_cpx*)malloc(buflen);
-    kiss_fft_cfg  cfg = kiss_fft_alloc(nfft,isinverse,0,0);
+    kiss_fft_cfg  cfg = kiss_fft_alloc(nfft,0,0);
     int k;
 
     for (k=0;k<nfft;++k) {
@@ -50,7 +50,10 @@
         in[k].i = (rand() % 65536) - 32768;
     }
 
-    kiss_fft(cfg,in,out);
+    if (isinverse)
+       kiss_ifft(cfg,in,out);
+    else
+       kiss_fft(cfg,in,out);
 
     check(in,out,nfft,isinverse);
 
--- a/tests/real-fft-test.c
+++ b/tests/real-fft-test.c
@@ -96,8 +96,8 @@
         cin[i].i = zero;
     }
 
-    kiss_fft_state = kiss_fft_alloc(NFFT,0,0,0);
-    kiss_fftr_state = kiss_fftr_alloc(NFFT,0,0,0);
+    kiss_fft_state = kiss_fft_alloc(NFFT,0,0);
+    kiss_fftr_state = kiss_fftr_alloc(NFFT,0,0);
     kiss_fft(kiss_fft_state,cin,cout);
     kiss_fftr(kiss_fftr_state,rin,sout);
     
@@ -118,12 +118,6 @@
 
     printf("%d complex ffts took %gs, real took %gs\n",NUMFFTS,tfft,trfft);
 
-    free(kiss_fft_state);
-    free(kiss_fftr_state);
-
-    kiss_fft_state = kiss_fft_alloc(NFFT,1,0,0);
-    kiss_fftr_state = kiss_fftr_alloc(NFFT,1,0,0);
-
     memset(cin,0,sizeof(cin));
 #if 1
     cin[0].r = rand_scalar();
@@ -153,7 +147,7 @@
        fin[2*i+1] = cin[i].i;
     }
     
-    kiss_fft(kiss_fft_state,cin,cout);
+    kiss_ifft(kiss_fft_state,cin,cout);
     kiss_fftri(kiss_fftr_state,fin,rout);
     /*
     printf(" results from inverse kiss_fft : (%f,%f), (%f,%f), (%f,%f), (%f,%f), (%f,%f) ...\n "