shithub: opus

--- a/libcelt/arch.h

+++ b/libcelt/arch.h

@@ -189,6 +189,7 @@

 #define MULT16_32_Q13(a,b)     ((a)*(b))

 #define MULT16_32_Q14(a,b)     ((a)*(b))

 #define MULT16_32_Q15(a,b)     ((a)*(b))

+#define MULT16_32_Q16(a,b)     ((a)*(b))

 #define MULT16_32_P15(a,b)     ((a)*(b))

 #define MULT32_32_Q31(a,b)     ((a)*(b))

--- a/libcelt/celt.c

+++ b/libcelt/celt.c

@@ -153,29 +153,26 @@

 /** Apply window and compute the MDCT for all sub-frames and all channels in a frame */

 static void compute_mdcts(const CELTMode *mode, const celt_word16_t * restrict window, celt_sig_t * restrict in, celt_sig_t * restrict out)

-   int c, N4;

    const mdct_lookup *lookup = MDCT(mode);

    const int N = FRAMESIZE(mode);

    const int C = CHANNELS(mode);

    const int overlap = OVERLAP(mode);

-   N4 = (N-overlap)>>1;

    if (C==1)

       mdct_forward(lookup, in, out, window, overlap);

    } else {

+      int c;

       VARDECL(celt_word32_t, x);

       VARDECL(celt_word32_t, tmp);

       SAVE_STACK;

-      ALLOC(x, 2*N, celt_word32_t);

+      ALLOC(x, N+overlap, celt_word32_t);

       ALLOC(tmp, N, celt_word32_t);

       for (c=0;c<C;c++)

          int j;

-         for (j=0;j<2*N-2*N4;j++)

-            x[j+N4] = in[C*j+c];

-         CELT_MEMSET(x, 0, N4);

-         CELT_MEMSET(x+2*N-N4, 0, N4);

-         mdct_forward(lookup, x+N4, tmp, window, overlap);

+         for (j=0;j<N+overlap;j++)

+            x[j] = in[C*j+c];

+         mdct_forward(lookup, x, tmp, window, overlap);

          /* Interleaving the sub-frames */

          for (j=0;j<N;j++)

             out[C*j+c] = tmp[j];

--- a/libcelt/fixed_debug.h

+++ b/libcelt/fixed_debug.h

@@ -44,6 +44,9 @@

 #define MULT16_16SU(a,b) ((celt_word32_t)(celt_word16_t)(a)*(celt_word32_t)(celt_uint16_t)(b))

 #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL32(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR32(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))

+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */

+#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))

 #define QCONST16(x,bits) ((celt_word16_t)(.5+(x)*(((celt_word32_t)1)<<(bits))))

 #define QCONST32(x,bits) ((celt_word32_t)(.5+(x)*(((celt_word32_t)1)<<(bits))))

--- a/libcelt/fixed_generic.h

+++ b/libcelt/fixed_generic.h

@@ -38,6 +38,9 @@

 /** Multiply a 16-bit signed value by a 16-bit unsigned value. The result is a 32-bit signed value */

 #define MULT16_16SU(a,b) ((celt_word32_t)(celt_word16_t)(a)*(celt_word32_t)(celt_uint16_t)(b))

+/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */

+#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))

 /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */

 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))

--- a/libcelt/mdct.c

+++ b/libcelt/mdct.c

@@ -111,8 +111,11 @@

          kiss_fft_scalar re, im;

          /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/

-         re = -HALF32(MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2));

-         im = -HALF32(MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]));

+         re = -(MULT16_32_Q16(*wp2, xp1[N2]) + MULT16_32_Q16(*wp1,*xp2));

+         im = -(MULT16_32_Q16(*wp1, *xp1)    - MULT16_32_Q16(*wp2, xp2[-N2]));

+#ifndef FIXED_POINT

+         re *= .5; im *= .5;

+#endif

          xp1+=2;

          xp2-=2;

          wp1+=2;

@@ -123,20 +126,6 @@

          *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);

          t++;

-      for(;i<N/8;i++)

-      {

-         kiss_fft_scalar re, im;

-         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/

-         re = -HALF32(*xp2);

-         im = -HALF32(*xp1);

-         xp1+=2;

-         xp2-=2;

-         /* We could remove the HALF32 above and just use MULT16_32_Q16 below

-            (MIXED_PRECISION only) */

-         *yp++ = S_MUL(re,t[0])  -  S_MUL(im,t[N4]);

-         *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);

-	 t++;

-      }

       wp1 = window;

       wp2 = window+overlap-1;

       for(;i<N4-overlap/4;i++)

@@ -143,7 +132,7 @@

          kiss_fft_scalar re, im;

          /* Real part arranged as a-bR, Imag part arranged as -c-dR */

-         re =  HALF32(-*xp2);

+         re = -HALF32(*xp2);

          im = -HALF32(*xp1);

          xp1+=2;

          xp2-=2;

@@ -151,14 +140,17 @@

             (MIXED_PRECISION only) */

          *yp++ = S_MUL(re,t[0])  -  S_MUL(im,t[N4]);

          *yp++ = S_MUL(im,t[0])  +  S_MUL(re,t[N4]);

-	 t++;

+         t++;

       for(;i<N4;i++)

          kiss_fft_scalar re, im;

          /* Real part arranged as a-bR, Imag part arranged as -c-dR */

-         re =  HALF32(MULT16_32_Q15(*wp1, xp1[-N2]) - MULT16_32_Q15(*wp2, *xp2));

-         im = -HALF32(MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]));

+         re =  (MULT16_32_Q16(*wp1, xp1[-N2]) - MULT16_32_Q16(*wp2, *xp2));

+         im = -(MULT16_32_Q16(*wp2, *xp1)     + MULT16_32_Q16(*wp1, xp2[N2]));

+#ifndef FIXED_POINT

+         re *= .5; im *= .5;

+#endif

          xp1+=2;

          xp2-=2;

          wp1+=2;

@@ -189,7 +181,7 @@

          fp += 2;

          yp1 += 2;

          yp2 -= 2;

-	 t++;

+         t++;

    RESTORE_STACK;