shithub: opus

--- a/Makefile.draft

+++ b/Makefile.draft

@@ -36,8 +36,7 @@

 LIBSUFFIX = .a

 OBJSUFFIX = .o

-CC     = $(TOOLCHAIN_PREFIX)gcc$(TOOLCHAIN_SUFFIX)

-CXX    = $(TOOLCHAIN_PREFIX)g++$(TOOLCHAIN_SUFFIX)

+CC     = $(TOOLCHAIN_PREFIX)cc$(TOOLCHAIN_SUFFIX)

 AR     = $(TOOLCHAIN_PREFIX)ar

 RANLIB = $(TOOLCHAIN_PREFIX)ranlib

 CP     = $(TOOLCHAIN_PREFIX)cp

@@ -79,7 +78,6 @@

 LDLIBS  += $(call ldlibs-from-libs,$(LIBS))

 COMPILE.c.cmdline   = $(CC) -c $(CFLAGS) -o $@ $<

-COMPILE.cpp.cmdline = $(CXX) -c $(CFLAGS) -o $@ $<

 LINK.o              = $(CC) $(LDPREFLAGS) $(LDFLAGS)

 LINK.o.cmdline      = $(LINK.o) $^ $(LDLIBS) -o $@$(EXESUFFIX)

--- a/README.draft

+++ b/README.draft

@@ -6,10 +6,11 @@

 to compile for a fixed-point architecture), simply edit the options in the

 Makefile.

-To build from the git repository instead of using this draft, follow these

+To build from the git repository instead of using this RFC, follow these

 steps:

-1) Clone the repository:

+1) Clone the repository (latest implementation of this standard at the time

+of publication)

 % git clone git://git.opus-codec.org/opus.git

 % cd opus

--- a/celt/bands.c

+++ b/celt/bands.c

@@ -99,8 +99,7 @@

                sum = MAC16_16(sum, EXTRACT16(VSHR32(X[j+c*N],shift)),

                                    EXTRACT16(VSHR32(X[j+c*N],shift)));

             } while (++j<M*eBands[i+1]);

-            /* We're adding one here to make damn sure we never end up with a pitch vector that's

-               larger than unity norm */

+            /* We're adding one here to ensure the normalized band isn't larger than unity norm */

             bandE[i+c*m->nbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift);

          } else {

             bandE[i+c*m->nbEBands] = EPSILON;

--- a/celt/kiss_fft.h

+++ b/celt/kiss_fft.h

@@ -37,19 +37,6 @@

 extern "C" {

 #endif

-/*

- ATTENTION!

- If you would like a :

- -- a utility that will handle the caching of fft objects

- -- real-only (no imaginary time component ) FFT

- -- a multi-dimensional FFT

- -- a command-line utility to perform ffts

- -- a command-line utility to perform fast-convolution filtering

- Then see kfc.h kiss_fftr.h kiss_fftnd.h fftutil.c kiss_fastfir.c

-  in the tools/ directory.

-*/

 #ifdef USE_SIMD

 # include <xmmintrin.h>

 # define kiss_fft_scalar __m128

--- a/celt/vq.c

+++ b/celt/vq.c

@@ -70,14 +70,7 @@

    opus_val16 gain, theta;

    int stride2=0;

    int factor;

-   /*int i;

-   if (len>=30)

-   {

-      for (i=0;i<len;i++)

-         X[i] = 0;

-      X[14] = 1;

-      K=5;

-   }*/

    if (2*K>=len || spread==SPREAD_NONE)

       return;

    factor = SPREAD_FACTOR[spread-1];

@@ -91,9 +84,8 @@

    if (len>=8*stride)

       stride2 = 1;

-      /* This is just a simple way of computing sqrt(len/stride) with rounding.

-         It's basically incrementing long as (stride2+0.5)^2 < len/stride.

-         I _think_ it is bit-exact */

+      /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding.

+         It's basically incrementing as long as (stride2+0.5)^2 < len/stride. */

       while ((stride2*stride2+stride2)*stride + (stride>>2) < len)

          stride2++;

@@ -113,13 +105,6 @@

             exp_rotation1(X+i*len, len, stride2, s, -c);

-   /*if (len>=30)

-   {

-      for (i=0;i<len;i++)

-         printf ("%f ", X[i]);

-      printf ("\n");

-      exit(0);

-   }*/

 /** Takes the pitch vector and the decoded residual vector, computes the gain

@@ -233,7 +218,6 @@

          while (++j<N);

          sum = QCONST16(1.f,14);

-      /* Do we have sufficient accuracy here? */

       rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum)));

       j=0; do {

 #ifdef FIXED_POINT

--- a/doc/draft-ietf-codec-opus.xml

+++ b/doc/draft-ietf-codec-opus.xml

@@ -80,8 +80,8 @@

 The Opus codec is a real-time interactive audio codec designed to meet the requirements

 described in <xref target="requirements"></xref>.

 It is composed of a linear

- prediction (LP)-based layer and a Modified Discrete Cosine Transform

- (MDCT)-based layer.

+ prediction (LP)-based <xref target="LPC"/> layer and a Modified Discrete Cosine Transform

+ (MDCT)-based <xref target="MDCT"/> layer.

 The main idea behind using two layers is that in speech, linear prediction

  techniques (such as CELP) code low frequencies more efficiently than transform

  (e.g., MDCT) domain techniques, while the situation is reversed for music and

@@ -273,8 +273,7 @@

 </t>

<t>

-The LP layer is based on the

- <eref target='http://developer.skype.com/silk'>SILK</eref> codec

+The LP layer is based on the SILK codec

  <xref target="SILK"></xref>.

 It supports NB, MB, or WB audio and frame sizes from 10&nbsp;ms to 60&nbsp;ms,

  and requires an additional 5&nbsp;ms look-ahead for noise shaping estimation.

@@ -290,9 +289,7 @@

 </t>

<t>

-The MDCT layer is based on the

- <eref target='http://www.celt-codec.org/'>CELT</eref>  codec

- <xref target="CELT"></xref>.

+The MDCT layer is based on the CELT  codec <xref target="CELT"></xref>.

 It supports NB, WB, SWB, or FB audio and frame sizes from 2.5&nbsp;ms to

  20&nbsp;ms, and requires an additional 2.5&nbsp;ms look-ahead due to the

  overlapping MDCT windows.

@@ -436,7 +433,7 @@

 0 is the lowest complexity and 10 is the highest. Examples of

 computations for which such trade-offs may occur are:

 <list style="symbols">

-<t>The order of the pitch analysis whitening filter,</t>

+<t>The order of the pitch analysis whitening filter <xref target="Whitening"/>,</t>

 <t>The order of the short-term noise shaping filter,</t>

 <t>The number of states in delayed decision quantization of the

 residual signal, and</t>

@@ -474,9 +471,8 @@

 is required. There are two main reasons to operate in CBR mode:

 <list style="symbols">

 <t>When the transport only supports a fixed size for each compressed frame</t>

-<t>When security is important <spanx style="emph">and</spanx> the input audio

-not a normal conversation but is highly constrained (e.g. yes/no, recorded prompts)

-<xref target="SRTP-VBR"></xref> </t>

+<t>When encryption is used for an audio stream that is either highly constrained

+   (e.g. yes/no, recorded prompts) or highly sensitive <xref target="SRTP-VBR"></xref> </t>

 </list>

 When low-latency transmission is required over a relatively slow connection, then

@@ -734,9 +730,9 @@

 </figure>

 </section>

-<section title="Code 3: An Arbitrary Number of Frames in the Packet">

+<section title="Code 3: A Signaled Number of Frames in the Packet">

<t>

-Code 3 packets may encode an arbitrary number of frames, as well as additional

+Code 3 packets signal the number of frames, as well as additional

  padding, called "Opus padding" to indicate that this padding is added at the

  Opus layer, rather than at the transport layer.

 Code 3 packets MUST have at least 2 bytes.

@@ -1271,10 +1267,10 @@

 The reference implementation reads them using ec_dec_bits() (entdec.c).

 Because the range decoder must read several bytes ahead in the stream, as

  described in <xref target="range-decoder-renorm"/>, the input consumed by the

- raw bits MAY overlap with the input consumed by the range coder, and a decoder

+ raw bits may overlap with the input consumed by the range coder, and a decoder

  MUST allow this.

 The format should render it impossible to attempt to read more raw bits than

- there are actual bits in the frame, though a decoder MAY wish to check for

+ there are actual bits in the frame, though a decoder may wish to check for

  this and report an error.

 </t>

 </section>

@@ -1388,9 +1384,9 @@

 <section anchor="ec_tell" title="ec_tell()">

<t>

-The whole number of bits buffered in rng may be estimated via l = ilog(rng).

+The whole number of bits buffered in rng may be estimated via lg = ilog(rng).

 ec_tell() then becomes a simple matter of removing these bits from the total.

-It returns (nbits_total - l).

+It returns (nbits_total - lg).

 </t>

<t>

 In a newly initialized decoder, before any symbols have been read, this reports

@@ -1403,7 +1399,7 @@

<t>

 ec_tell_frac() estimates the number of bits buffered in rng to fractional

  precision.

-Since rng must be greater than 2**23 after renormalization, l must be at least

+Since rng must be greater than 2**23 after renormalization, lg must be at least

24.

Let

 <figure align="center">

@@ -1414,7 +1410,7 @@

 </figure>

  so that 32768 &lt;= r_Q15 &lt; 65536, an unsigned Q15 value representing the

  fractional part of rng.

-Then the following procedure can be used to add one bit of precision to l.

+Then the following procedure can be used to add one bit of precision to lg.

 First, update

 <figure align="center">

 <artwork align="center">

@@ -1422,11 +1418,11 @@

 r_Q15 = (r_Q15*r_Q15) >> 15 .

 ]]></artwork>

 </figure>

-Then add the 16th bit of r_Q15 to l via

+Then add the 16th bit of r_Q15 to lg via

 <figure align="center">

 <artwork align="center">

 <![CDATA[

-l = 2*l + (r_Q15 >> 16) .

+lg = 2*lg + (r_Q15 >> 16) .

 ]]></artwork>

 </figure>

 Finally, if this bit was a 1, reduce r_Q15 by a factor of two via

@@ -1439,8 +1435,8 @@

  so that it once again lies in the range 32768 &lt;= r_Q15 &lt; 65536.

 </t>

<t>

-This procedure is repeated three times to extend l to 1/8th bit precision.

-ec_tell_frac() then returns (nbits_total*8 - l).

+This procedure is repeated three times to extend lg to 1/8th bit precision.

+ec_tell_frac() then returns (nbits_total*8 - lg).

 </t>

 </section>

@@ -5301,7 +5297,7 @@

<t>

 A negative TF adjustment means that the temporal resolution is increased,

 while a positive TF adjustment means that the frequency resolution is increased.

-Changes in TF resolution are implemented using the Hadamard transform. To increase

+Changes in TF resolution are implemented using the Hadamard transform <xref target="Hadamard"/>. To increase

 the time resolution by N, N "levels" of the Hadamard transform are applied to the

 decoded vector for each interleaved MDCT vector. To increase the frequency resolution

 (assumes a transient frame), then N levels of the Hadamard transform are applied

@@ -5459,9 +5455,9 @@

<t>

 A decoder MAY employ a more sophisticated drift compensation method. For

 example, the

-<eref target='http://code.google.com/p/webrtc/source/browse/trunk/src/modules/audio_coding/NetEQ/main/source/?r=583'>NetEQ component</eref>

+<xref target='Google-NetEQ'>NetEQ component</xref>

 of the

-<eref target='http://code.google.com/p/webrtc/'>WebRTC.org codebase</eref>

+<xref target='Google-WebRTC'>Google WebRTC codebase</xref>

 compensates for drift by adding or removing

 one period when the signal is highly periodic. The reference implementation of

 Opus allows a caller to learn whether the current frame's signal is highly

@@ -6822,7 +6818,7 @@

 each value depends on the quantization decision of the previous value.

 This dependency is exploited by the delayed decision mechanism to

 search for a quantization sequency with best R/D performance

-with a Viterbi-like algorithm .

+with a Viterbi-like algorithm <xref target="Viterbi"/>.

 The quantizer processes the residual LSF vector in reverse order

 (i.e., it starts with the highest residual LSF value).

 This is done because the prediction works slightly

@@ -7274,14 +7270,15 @@

 <section title="Opus Custom">

<t>

-To complement the Opus specification, the "Opus Custom" codec is defined to

+Opus Custom is an OPTIONAL part of the specification that is defined to

 handle special sample rates and frame rates that are not supported by the

 main Opus specification. Use of Opus Custom is discouraged for all but very

 special applications for which a frame size different from 2.5, 5, 10, or 20&nbsp;ms is

-needed (for either complexity or latency reasons). Such applications will not

-be compatible with the "main" Opus codec. In Opus Custom operation,

-only the CELT layer is available, which is available using the celt_* function

-calls in celt.h.

+needed (for either complexity or latency reasons). Because Opus Custom is

+optional, applications using that part of the specification may not be compatible

+with other applications implementing Opus. In Opus Custom operation,

+only the CELT layer is available, using the opus_custom_* function

+calls in opus_custom.h.

 </t>

 </section>

@@ -7338,7 +7335,7 @@

 </t>

 </list>

 In all of the conditions above, both the encoder and the decoder were run

- inside the <eref target="http://valgrind.org/">Valgrind</eref> memory

+ inside the <xref target="Valgrind">Valgrind</xref> memory

  debugger, which tracks reads and writes to invalid memory regions as well as

  the use of uninitialized memory.

 There were no errors reported on any of the tested conditions.

@@ -7407,7 +7404,7 @@

 <format type='TXT' target='http://tools.ietf.org/rfc/rfc6366.txt' />

 </reference>

-<reference anchor='SILK'>

+<reference anchor='SILK' target='http://developer.skype.com/silk'>

 <front>

 <title>SILK Speech Codec</title>

 <author initials='K.' surname='Vos' fullname='K. Vos'>

@@ -7442,7 +7439,7 @@

 <seriesInfo name="ICASSP-1991, Proc. IEEE Int. Conf. Acoust., Speech, Signal Processing, pp. 641-644, October" value="1991"/>

 </reference>

-<reference anchor='CELT'>

+<reference anchor='CELT' target='http://celt-codec.org/'>

 <front>

 <title>Constrained-Energy Lapped Transform (CELT) Codec</title>

 <author initials='J-M.' surname='Valin' fullname='J-M. Valin'>

@@ -7472,8 +7469,8 @@

 <abstract>

 <t></t>

 </abstract></front>

-<seriesInfo name='Internet-Draft' value='draft-ietf-avtcore-srtp-vbr-audio-03' />

-<format type='TXT' target='http://tools.ietf.org/html/draft-ietf-avtcore-srtp-vbr-audio-03' />

+<seriesInfo name='RFC' value='6562' />

+<format type='TXT' target='http://tools.ietf.org/html/rfc6562' />

 </reference>

 <reference anchor='DOS'>

@@ -7536,6 +7533,98 @@

 <seriesInfo name="IEEE Trans. on Information Theory, Vol. 32" value="pp. 568-583" />

 </reference>

+<reference anchor="Valgrind" target="http://valgrind.org/">

+<front>

+<title>Valgrind website</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Google-NetEQ" target="http://code.google.com/p/webrtc/source/browse/trunk/src/modules/audio_coding/NetEQ/main/source/?r=583">

+<front>

+<title>Google NetEQ code</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Google-WebRTC" target="http://code.google.com/p/webrtc/">

+<front>

+<title>Google WebRTC code</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Opus-git" target="git://git.xiph.org/opus.git">

+<front>

+<title>Opus Git Repository</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Opus-website" target="http://opus-codec.org/">

+<front>

+<title>Opus website</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Vectors-website" target="http://opus-codec.org/testvectors/">

+<front>

+<title>Opus Testvectors (website)</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Vectors-proc" target="http://www.ietf.org/proceedings/83/slides/slides-83-codec-0.gz">

+<front>

+<title>Opus Testvectors (proceedings)</title>

+<author></author>

+</front>

+</reference>

+<reference anchor="Hadamard" target="http://en.wikipedia.org/wiki/Hadamard_transform">

+<front>

+<title>Hadamard Transform</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

+<reference anchor="Viterbi" target="http://en.wikipedia.org/wiki/Viterbi_algorithm">

+<front>

+<title>Viterbi Algorithm</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

+<reference anchor="Whitening" target="http://en.wikipedia.org/wiki/White_noise">

+<front>

+<title>White Noise</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

+<reference anchor="LPC" target="http://en.wikipedia.org/wiki/Linear_prediction">

+<front>

+<title>Linear Prediction</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

+<reference anchor="MDCT" target="http://en.wikipedia.org/wiki/Modified_discrete_cosine_transform">

+<front>

+<title>Modified Discrete Cosine Transform</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

+<reference anchor="FFT" target="http://en.wikipedia.org/wiki/Fast_Fourier_transform">

+<front>

+<title>Fast Fourier Transform</title>

+<author><organization>Wikipedia</organization></author>

+</front>

+</reference>

 </references>

 <section anchor="ref-implementation" title="Reference Implementation">

@@ -7551,7 +7640,7 @@

 <t>The implementation can be compiled with either a C89 or a C99

 compiler. It is reasonably optimized for most platforms such that

 only architecture-specific optimizations are likely to be useful.

-The FFT used is a slightly modified version of the KISS-FFT library,

+The FFT <xref target="FFT"/> used is a slightly modified version of the KISS-FFT library,

 but it is easy to substitute any other FFT library.

 </t>

@@ -7586,7 +7675,7 @@

 <list style="symbols">

 <t><![CDATA[

-cat draft-ietf-codec-opus.txt | grep '^\ \ \ ###' | sed -e 's/\s\s\s###//' | base64 -d > opus_source.tar.gz

+cat draft-ietf-codec-opus.txt | grep '^\ \ \ ###' | sed -e 's/...###//' | base64 -d > opus_source.tar.gz

 ]]></t>

<t>

 tar xzvf opus_source.tar.gz

@@ -7594,11 +7683,19 @@

 <t>cd opus_source</t>

 <t>make</t>

 </list>

+On systems where the provided Makefile does not work, the following command line may be used to compile

+the source code:

+<list style="symbols">

+<t><![CDATA[

+cc -O2 -g -o opus_demo src/opus_demo.c `cat *.mk | grep -v fixed | sed -e 's/.*=//' -e 's/\\\\//'` -DOPUS_BUILD -Iinclude -Icelt -Isilk -Isilk/float -Drestrict= -lm

+]]></t></list>

+</t>

+<t>

 On systems where the base64 utility is not present, the following commands can be used instead:

 <list style="symbols">

 <t><![CDATA[

-cat draft-ietf-codec-opus.txt | grep '^\ \ \ ###' | sed -e 's/\s\s\s###//' > opus.b64

+cat draft-ietf-codec-opus.txt | grep '^\ \ \ ###' | sed -e 's/...###//' > opus.b64

 ]]></t>

 <t>openssl base64 -d -in opus.b64 > opus_source.tar.gz</t>

 </list>

@@ -7606,12 +7703,13 @@

 </t>

 </section>

-<section title="Development Versions">

+<section title="Up-to-date Implementation">

<t>

-The current development version of the source code is available in a

- <eref target='git://git.opus-codec.org/opus.git'>Git repository</eref>.

-Development snapshots are provided at

- <eref target='http://opus-codec.org/'/>.

+As of the time of publication of this memo, up-to-date source code implementing

+this standard is available in a

+ <xref target='Opus-git'>Git repository</xref>.

+Releases and other resources are available at

+ <xref target='Opus-website'/>.

 </t>

 </section>

@@ -7624,9 +7722,8 @@

 <section anchor="test-vectors" title="Test Vectors">

<t>

 Because of size constraints, the Opus test vectors are not distributed in this

-draft. They are available from the Opus codec website at

-<eref target="http://opus-codec.org/testvectors/"/> and will also be made available

-in IETF meeting proceedings. These test vectors were created specifically to exercise

+draft. They are available in the proceedings of the 83rd IETF meeting (Paris) <xref target="Vectors-proc"/> and from the Opus codec website at

+<xref target="Vectors-website"/>. These test vectors were created specifically to exercise

 all aspects of the decoder and therefore the audio quality of the decoded output is

 significantly lower than what Opus can achieve in normal operation.

 </t>

--- a/include/opus_defines.h

+++ b/include/opus_defines.h

@@ -120,7 +120,7 @@

 #define OPUS_AUTO                           -1000 /**<Auto/default setting @hideinitializer*/

 #define OPUS_BITRATE_MAX                       -1 /**<Maximum bitrate @hideinitializer*/

-/** Best for "standard" VoIP/videoconference applications where listening quality and intelligibility matter most

+/** Best for most VoIP/videoconference applications where listening quality and intelligibility matter most

  * @hideinitializer */

 #define OPUS_APPLICATION_VOIP                2048

 /** Best for broadcast/high-fidelity application where the decoded audio should be as close as possible to the input

--- a/src/opus_decoder.c

+++ b/src/opus_decoder.c

@@ -292,7 +292,7 @@

       if (st->prev_mode==MODE_CELT_ONLY)

          silk_InitDecoder( silk_dec );

-      /* The SILK PLC cannot support produce frames of less than 10 ms */

+      /* The SILK PLC cannot produce frames of less than 10 ms */

       st->DecControl.payloadSize_ms = IMAX(10, 1000 * audiosize / st->Fs);

       if (data != NULL)

@@ -574,7 +574,7 @@

       last_size = len-size[0];

       break;

    /* Multiple CBR/VBR frames (from 0 to 120 ms) */

-   case 3:

+   default: /*case 3:*/

       if (len<1)

          return OPUS_INVALID_PACKET;

       /* Number of frames encoded in bits 0 to 5 */

--- a/tests/run_vectors.sh

+++ b/tests/run_vectors.sh

@@ -36,7 +36,7 @@

 echo "=============="

 echo

-for file in `seq -w 1 11`

+for file in 01 02 03 04 05 06 07 08 09 10 11 12

do

     if [ -e $VECTOR_PATH/testvector$file.bit ]; then

         echo Testing testvector$file

@@ -66,7 +66,7 @@

 echo "=============="

 echo

-for file in `seq -w 1 11`

+for file in 01 02 03 04 05 06 07 08 09 10 11 12

do

     if [ -e $VECTOR_PATH/testvector$file.bit ]; then

         echo Testing testvector$file