ref: 4eb399d7f4d97f5f20081db6196dba04d22113a4
parent: 0f5ff80bacdde3f68e746c5eeede9590149b6ac2
author: Jean-Marc Valin <[email protected]>
date: Mon Jul 1 16:19:24 EDT 2013
More speech/music detection comments (delayed decision)
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -177,6 +177,8 @@
curr_lookahead = IMAX(curr_lookahead-10, 0);
psum=0;
+ /* Summing the probability of transition patterns that involve music at
+ time (DETECT_SIZE-curr_lookahead-1) */
for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
psum += tonal->pmusic[i];
for (;i<DETECT_SIZE;i++)
@@ -479,7 +481,9 @@
frame_probs[0] = .5f*(frame_probs[0]+1);
/* Curve fitting between the MLP probability and the actual probability */
frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
+ /* Probability of active audio (as opposed to silence) */
frame_probs[1] = .5f*frame_probs[1]+.5f;
+ /* Consider that silence has a 50-50 probability of being speech or music. */
frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
/*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
@@ -491,13 +495,15 @@
float beta;
/* Denormalized probability of speech (p0) and music (p1) after update */
float p0, p1;
- /* Delayed decision variables */
+ /* Probabilities for "all speech" and "all music" */
float s0, m0;
+ /* Probability sum for renormalisation */
float psum;
+ /* Instantaneous probability of speech and music, with beta pre-applied. */
float speech0;
float music0;
- /* One transition every 3 minutes */
+ /* One transition every 3 minutes of active audio */
tau = .00005f*frame_probs[1];
beta = .05f;
if (1) {
@@ -522,6 +528,7 @@
/* This chunk of code deals with delayed decision. */
psum=1e-20f;
+ /* Instantaneous probability of speech and music, with beta pre-applied. */
speech0 = (float)pow(1-frame_probs[0], beta);
music0 = (float)pow(frame_probs[0], beta);
if (tonal->count==1)
@@ -529,18 +536,25 @@
tonal->pspeech[0]=.5;
tonal->pmusic [0]=.5;
}
+ /* Updated probability of having only speech (s0) or only music (m0),
+ before considering the new observation. */
s0 = tonal->pspeech[0] + tonal->pspeech[1];
m0 = tonal->pmusic [0] + tonal->pmusic [1];
+ /* Updates pspeech[0] and pmusic[0] from s0 and m0 (no transition) with instantaneous probability. */
tonal->pspeech[0] = s0*(1-tau)*speech0;
tonal->pmusic [0] = m0*(1-tau)*music0;
+ /* Propagate the transition probabilities */
for (i=1;i<DETECT_SIZE-1;i++)
{
tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
}
+ /* Probability that the latest frame is speech, when all the previous ones were music. */
tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+ /* Probability that the latest frame is music, when all the previous ones were speech. */
tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+ /* Renormalise probabilities so that they sum to 1 */
for (i=0;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i] + tonal->pmusic[i];
psum = 1.f/psum;
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -62,7 +62,11 @@
int count;
opus_val32 subframe_mem[3];
int analysis_offset;
+ /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
+ pspeech[0] is the probability that all frames in the window are speech. */
float pspeech[DETECT_SIZE];
+ /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
+ pmusic[0] is the probability that all frames in the window are music. */
float pmusic[DETECT_SIZE];
float speech_confidence;
float music_confidence;