ref: b5972388d7c9b0d5f5a780da23ba00dce6a2628d
parent: a5e96b84302f2008ec32e8664356dc83efd72c17
author: Jean-Marc Valin <[email protected]>
date: Fri Oct 7 04:38:27 EDT 2011
Proper SILK delay compensation for resampling Adds SILK delay compensation that depends on encode and decode sampling rate, as well as SILK internal coding rate. This ensures that the SILK part of Opus is always in sync with the CELT part no matter what the sampling rates are. It also increases the resampling delay to 1.15 ms (was previously 0.48 ms).
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -37,6 +37,14 @@
#endif
#include "tuning_parameters.h"
+
+static const int enc_delay_matrix[3][5] = {
+/*SILK API 8 12 16 24 48 */
+/* 8 */ {5, 0, 3, 4, 8},
+/*12 */ {0, 6, 0, 0, 0},
+/*16 */ {4, 5, 11, 5, 18}
+};
+
opus_int silk_setup_resamplers(
silk_encoder_state_Fxx *psEnc, /* I/O */
opus_int fs_kHz /* I */
@@ -234,6 +242,9 @@
psEnc->sCmn.PacketSize_ms = PacketSize_ms;
psEnc->sCmn.TargetRate_bps = 0; /* trigger new SNR computation */
}
+
+ psEnc->sCmn.delay = enc_delay_matrix[rateID(fs_kHz*1000)][rateID(psEnc->sCmn.API_fs_Hz)];
+ silk_assert(psEnc->sCmn.delay <= MAX_ENCODER_DELAY);
/* Set internal sampling frequency */
silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
--- a/silk/dec_API.c
+++ b/silk/dec_API.c
@@ -31,6 +31,14 @@
#include "API.h"
#include "main.h"
+static const int dec_delay_matrix[3][5] = {
+/*SILK API 8 12 16 24 48 */
+/* 8 */ {3, 0, 2, 0, 0},
+/*12 */ {0, 8, 5, 7, 5},
+/*16 */ {0, 0, 8, 5, 5}
+};
+
+
/************************/
/* Decoder Super Struct */
/************************/
@@ -82,13 +90,16 @@
{
opus_int i, n, prev_fs_kHz, decode_only_middle = 0, ret = SILK_NO_ERROR;
opus_int32 nSamplesOutDec, LBRR_symbol;
- opus_int16 samplesOut1_tmp[ 2 ][ MAX_FS_KHZ * MAX_FRAME_LENGTH_MS + 2 ];
+ opus_int16 samplesOut1_tmp[ 2 ][ MAX_FS_KHZ * MAX_FRAME_LENGTH_MS + 2 + MAX_DECODER_DELAY ];
opus_int16 samplesOut2_tmp[ MAX_API_FS_KHZ * MAX_FRAME_LENGTH_MS ];
opus_int32 MS_pred_Q13[ 2 ] = { 0 };
opus_int16 *resample_out_ptr;
silk_decoder *psDec = ( silk_decoder * )decState;
silk_decoder_state *channel_state = psDec->channel_state;
+ int delay;
+ delay = channel_state[ 0 ].delay;
+
/**********************************/
/* Test if first frame in payload */
/**********************************/
@@ -106,6 +117,7 @@
ret += silk_init_decoder( &channel_state[ 1 ] );
if( psDec->nChannelsAPI == 2 ) {
silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) );
+ silk_memcpy( &channel_state[ 1 ].delayBuf, &channel_state[ 0 ].delayBuf, MAX_DECODER_DELAY*sizeof(opus_int16));
}
}
@@ -143,9 +155,12 @@
/* Initialize resampler when switching internal or external sampling frequency */
if( prev_fs_kHz != channel_state[ 0 ].fs_kHz || channel_state[ 0 ].prev_API_sampleRate != decControl->API_sampleRate ) {
+ channel_state[ 0 ].delay = dec_delay_matrix[rateID(silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ))][rateID(decControl->API_sampleRate)];
+ silk_assert(channel_state[ 0 ].delay <= MAX_DECODER_DELAY);
ret = silk_resampler_init( &channel_state[ 0 ].resampler_state, silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ), decControl->API_sampleRate );
if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) );
+ channel_state[ 1 ].delay = channel_state[ 0 ].delay;
}
}
channel_state[ 0 ].prev_API_sampleRate = decControl->API_sampleRate;
@@ -230,19 +245,19 @@
/* Call decoder for one frame */
for( n = 0; n < decControl->nChannelsInternal; n++ ) {
if( n == 0 || decode_only_middle == 0 ) {
- ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag );
+ ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 + delay ], &nSamplesOutDec, lostFlag );
} else {
- silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
+ silk_memset( &samplesOut1_tmp[ n ][ 2 + delay ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
}
}
if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
/* Convert Mid/Side to Left/Right */
- silk_stereo_MS_to_LR( &psDec->sStereo, samplesOut1_tmp[ 0 ], samplesOut1_tmp[ 1 ], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec );
+ silk_stereo_MS_to_LR( &psDec->sStereo, &samplesOut1_tmp[ 0 ][delay], &samplesOut1_tmp[ 1 ][delay], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec );
} else {
/* Buffering */
- silk_memcpy( samplesOut1_tmp[ 0 ], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) );
- silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec ], 2 * sizeof( opus_int16 ) );
+ silk_memcpy( &samplesOut1_tmp[ 0 ][delay], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) );
+ silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec + delay ], 2 * sizeof( opus_int16 ) );
}
/* Number of output samples */
@@ -256,8 +271,11 @@
}
for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {
+
+ silk_memcpy(&samplesOut1_tmp[ n ][ 1 ], &channel_state[ n ].delayBuf[ MAX_DECODER_DELAY-delay ], delay*sizeof(opus_int16));
/* Resample decoded signal to API_sampleRate */
ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec );
+ silk_memcpy(channel_state[ n ].delayBuf, &samplesOut1_tmp[ n ][ 1 + nSamplesOutDec + delay - MAX_DECODER_DELAY ], MAX_DECODER_DELAY*sizeof(opus_int16));
/* Interleave if stereo output and stereo stream */
if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
--- a/silk/define.h
+++ b/silk/define.h
@@ -86,6 +86,9 @@
#define MAX_FRAME_LENGTH_MS ( SUB_FRAME_LENGTH_MS * MAX_NB_SUBFR )
#define MAX_FRAME_LENGTH ( MAX_FRAME_LENGTH_MS * MAX_FS_KHZ )
+#define MAX_ENCODER_DELAY 18
+#define MAX_DECODER_DELAY 8
+
/* Milliseconds of lookahead for pitch analysis */
#define LA_PITCH_MS 2
#define LA_PITCH_MAX ( LA_PITCH_MS * MAX_FS_KHZ )
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -138,8 +138,8 @@
opus_int speech_act_thr_for_switch_Q8;
opus_int32 TargetRate_bps, MStargetRates_bps[ 2 ], channelRate_bps, LBRR_symbol;
silk_encoder *psEnc = ( silk_encoder * )encState;
- opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ ];
- opus_int transition;
+ opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ + MAX_ENCODER_DELAY];
+ opus_int transition, delay;
psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded = psEnc->state_Fxx[ 1 ].sCmn.nFramesEncoded = 0;
@@ -222,6 +222,7 @@
}
silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
+ delay = psEnc->state_Fxx[ 0 ].sCmn.delay;
/* Input buffering/resampling and encoding */
while( 1 ) {
nSamplesToBuffer = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
@@ -231,12 +232,15 @@
if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 2 ) {
int id = psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded;
for( n = 0; n < nSamplesFromInput; n++ ) {
- buf[ n ] = samplesIn[ 2 * n ];
+ buf[ n+delay ] = samplesIn[ 2 * n ];
}
+ silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
/* Making sure to start both resamplers from the same state when switching from mono to stereo */
if(psEnc->nPrevChannelsInternal == 1 && id==0) {
- silk_memcpy(&psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+ silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+ silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.delayBuf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf, MAX_ENCODER_DELAY*sizeof(opus_int16));
}
+ silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
&psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
@@ -245,23 +249,31 @@
nSamplesToBuffer = psEnc->state_Fxx[ 1 ].sCmn.frame_length - psEnc->state_Fxx[ 1 ].sCmn.inputBufIx;
nSamplesToBuffer = silk_min( nSamplesToBuffer, 10 * nBlocksOf10ms * psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
for( n = 0; n < nSamplesFromInput; n++ ) {
- buf[ n ] = samplesIn[ 2 * n + 1 ];
+ buf[ n+delay ] = samplesIn[ 2 * n + 1 ];
}
+ silk_memcpy(buf, &psEnc->state_Fxx[ 1 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state,
&psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+ silk_memcpy(psEnc->state_Fxx[ 1 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
+
psEnc->state_Fxx[ 1 ].sCmn.inputBufIx += nSamplesToBuffer;
} else if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 1 ) {
/* Combine left and right channels before resampling */
for( n = 0; n < nSamplesFromInput; n++ ) {
- buf[ n ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ], 1 );
+ buf[ n+delay ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ], 1 );
}
+ silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
&psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+ silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
} else {
silk_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 );
+ silk_memcpy(buf+delay, samplesIn, nSamplesFromInput*sizeof(opus_int16));
+ silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
- &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], samplesIn, nSamplesFromInput );
+ &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+ silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
}
--- a/silk/main.h
+++ b/silk/main.h
@@ -43,6 +43,8 @@
/* Uncomment the next line to force a fixed internal sampling rate (independent of what bitrate is used */
/*#define FORCE_INTERNAL_FS_KHZ 16*/
+/* Simple way to make [8000, 12000, 16000, 24000, 48000] to [0,1,2,3,4] */
+#define rateID(R) ( ( ( ((R)>>12) - ((R)>16000) ) >> ((R)>24000) ) - 1 )
/* Convert Left/Right stereo signal to adaptive Mid/Side representation */
void silk_stereo_LR_to_MS(
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -149,6 +149,7 @@
opus_int minInternal_fs_Hz; /* Minimum internal sampling frequency (Hz) */
opus_int desiredInternal_fs_Hz; /* Soft request for internal sampling frequency (Hz) */
opus_int fs_kHz; /* Internal sampling frequency (kHz) */
+ opus_int delay; /* Number of samples of delay to apply */
opus_int nb_subfr; /* Number of 5 ms subframes in a frame */
opus_int frame_length; /* Frame length (samples) */
opus_int subfr_length; /* Subframe length (samples) */
@@ -192,6 +193,7 @@
/* Input/output buffering */
opus_int16 inputBuf[ MAX_FRAME_LENGTH + 2 ]; /* Buffer containing input signal */
+ opus_int16 delayBuf[MAX_ENCODER_DELAY];
opus_int inputBufIx;
opus_int nFramesPerPacket;
opus_int nFramesEncoded; /* Number of frames analyzed in current packet */
@@ -257,6 +259,8 @@
opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + MAX_LPC_ORDER ];
opus_int32 exc_Q10[ MAX_FRAME_LENGTH ];
opus_int16 outBuf[ 2 * MAX_FRAME_LENGTH ]; /* Buffer for output signal */
+ opus_int16 delayBuf[ MAX_DECODER_DELAY ]; /* Buffer for delaying the SILK output prior to resampling */
+ opus_int delay; /* How much decoder delay to add */
opus_int lagPrev; /* Previous Lag */
opus_int8 LastGainIndex; /* Previous gain index */
opus_int fs_kHz; /* Sampling frequency in kHz */
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -123,6 +123,11 @@
{ 48000, 24000}, /* mono */
{ 48000, 24000}, /* stereo */
};
+
+static const int celt_delay_table[5] = {
+/* API 8 12 16 24 48 */
+ 10, 16, 21, 27, 55
+};
int opus_encoder_get_size(int channels)
{
int silkEncSizeBytes, celtEncSizeBytes;
@@ -202,15 +207,9 @@
st->encoder_buffer = st->Fs/100;
st->delay_compensation = st->Fs/400;
- /* This part is meant to compensate for the resampler delay as a function
- of the API sampling rate */
- if (st->Fs == 48000)
- st->delay_compensation += 23;
- else if (st->Fs == 24000)
- st->delay_compensation += 15;
- else
- st->delay_compensation += 2;
+ st->delay_compensation += celt_delay_table[rateID(st->Fs)];
+
st->hybrid_stereo_width_Q14 = 1 << 14;
st->variable_HP_smth2_Q15 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 );
st->first = 1;
@@ -486,7 +485,7 @@
}
#endif
- if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0)
+ if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0)
{
/* Delay stereo->mono transition by two frames so that SILK can do a smooth downmix */
st->silk_mode.toMono = 1;