ref: 747c817d96482883527c592b48367975c6f9a1a2
parent: cd213ea19cd4b5da427ad90f56220c21bd5f7a15
author: Jean-Marc Valin <[email protected]>
date: Tue Nov 22 17:44:56 EST 2011
Adds MFCC standard deviation features
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -82,6 +82,8 @@
float lowE[NB_TBANDS], highE[NB_TBANDS];
float mem[32];
float cmean[8];
+ float std[9];
+ float music_prob;
int E_count;
int count;
} TonalityAnalysisState;
@@ -109,8 +111,13 @@
float frame_stationarity;
float relativeE;
float frame_prob;
+ float alpha;
celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
+ alpha = 1.f/IMIN(20, 1+tonal->count);
+
+ if (tonal->count<4)
+ tonal->music_prob = .5;
kfft = mode->mdct.kfft[0];
if (C==1)
{
@@ -283,8 +290,9 @@
for (i=0;i<5;i++)
features[i] = -0.12299*(BFCC[i]+tonal->mem[i+24]) + 0.49195*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693*tonal->mem[i+8] - 1.4349*tonal->cmean[i];
+
for (i=0;i<5;i++)
- tonal->cmean[i] = .95*tonal->cmean[i] + .05*BFCC[i];
+ tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*(i==0)*BFCC[i];
for (i=0;i<5;i++)
features[5+i] = 0.63246*(BFCC[i]-tonal->mem[i+24]) + 0.31623*(tonal->mem[i]-tonal->mem[i+16]);
@@ -291,6 +299,12 @@
for (i=0;i<4;i++)
features[10+i] = 0.53452*(BFCC[i]+tonal->mem[i+24]) - 0.26726*(tonal->mem[i]+tonal->mem[i+16]) -0.53452*tonal->mem[i+8];
+ if (tonal->count > 5)
+ {
+ for (i=0;i<9;i++)
+ tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[5+i]*features[5+i];
+ }
+
for (i=0;i<8;i++)
{
tonal->mem[i+24] = tonal->mem[i+16];
@@ -301,28 +315,33 @@
features[14] = info->tonality;
features[15] = info->activity;
features[16] = frame_stationarity;
+ features[17] = info->tonality_slope;
+ for (i=0;i<9;i++)
+ features[18+i] = sqrt(tonal->std[i]);
#ifndef FIXED_POINT
mlp_process(&net, features, &frame_prob);
frame_prob = .5*(frame_prob+1);
+ frame_prob = MAX16(.01f, MIN16(0.99f, frame_prob));
/*frame_prob = .45*frame_prob + .55*frame_prob*frame_prob*frame_prob;*/
/*printf("%f\n", frame_prob);*/
{
- float alpha, beta;
+ float tau, beta;
float p0, p1;
- alpha = .01;
- beta = .2;
- p0 = (1-info->music_prob)*(1-alpha) + info->music_prob *alpha;
- p1 = info->music_prob *(1-alpha) + (1-info->music_prob)*alpha;
+ tau = .0001;
+ beta = .1;
+ p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
+ p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
p0 *= pow(1-frame_prob, beta);
p1 *= pow(frame_prob, beta);
- info->music_prob = p1/(p0+p1);
- /*printf("%f\n", info->music_prob);*/
+ tonal->music_prob = MAX16(0.01f, MIN16(0.99f, p1/(p0+p1)));
+ info->music_prob = tonal->music_prob;
+ /*printf("%f %f\n", frame_prob, info->music_prob);*/
}
#else
info->music_prob = 0;
#endif
- /*for (i=0;i<17;i++)
+ /*for (i=0;i<27;i++)
printf("%f ", features[i]);
printf("\n");*/
--- a/src/mlp_data.c
+++ b/src/mlp_data.c
@@ -1,53 +1,73 @@
#include "mlp.h"
-/* RMS error was 0.289869, seed was 1321927439 */
+/* RMS error was 0.082498, seed was 1321973245 */
-static const float weights[191] = {
+static const float weights[291] = {
/* hidden layer */
-2.96755, 0.00115262, -0.241332, -0.176303, -0.23881,
--0.104982, 0.0976941, 0.0866153, 0.0445939, 0.109937,
-0.0233463, 0.0474137, -0.214098, -0.104922, -0.182143,
--8.4938, -6.0075, -0.521072, 0.0844896, -0.017247,
--0.00201771, 0.102053, -0.000613734, -0.0365414, -0.313326,
--0.388116, 0.195686, 0.415635, 0.139734, -0.375908,
--0.495817, 0.197472, 0.108982, 15.8272, 1.39337,
-0.295248, 0.478087, -0.521584, 0.547038, -0.970695,
-0.410447, -0.0398747, -0.157433, 0.225191, -0.159447,
-0.0773803, -0.14035, -0.432067, 0.436629, -0.81,
-0.258396, -1.35361, 0.379898, 0.0501006, 5.69164,
-0.0244047, -0.0253634, 0.0778099, -0.00686666, 0.0670103,
-0.131673, -0.0865675, -0.088408, -0.0215524, -0.105515,
-0.130154, -0.0107951, 0.0149045, -0.00721336, 2.70627,
--5.84219, 0.845236, 2.9728, 0.0480452, 0.0522916,
--0.17107, -0.844511, 0.086013, 0.0808069, 0.0362425,
-0.104797, 0.0312275, 0.100703, 0.0868895, 0.107739,
--0.155109, -0.743343, 2.12173, -3.50347, 3.38095,
--4.60509, -0.0940445, 0.133728, -0.0279815, 0.072341,
-0.0587296, -0.490762, -0.68488, -0.171973, -0.0674625,
-0.0557464, -0.000785266, 0.326857, -0.109421, 0.0148745,
--22.8631, 4.74747, -0.927737, -0.125692, -0.484348,
-0.448016, -0.858588, 0.36091, -0.0261568, -0.193647,
-0.224419, -0.156897, 0.0704276, -0.133405, -0.420752,
-0.374365, -0.718979, 0.213614, -0.00984738, 1.75345,
--0.739806, 1.5547, 0.23016, -0.314378, -0.221868,
-0.444039, -0.287516, 0.0769676, 0.025709, 0.0960222,
--0.0841409, 0.188217, 0.311774, -0.269616, -0.237803,
-0.318714, 5.50044, -3.76367, 5.06448, 0.592319,
--3.03044, 3.38612, -4.38443, 1.30165, -0.804144,
--0.531203, 0.605657, -0.43792, 0.352739, -0.0578825,
--2.29906, 2.33548, -2.93828, 0.74198, 21.5563,
-5.77912, -11.0732, 0.552401, -0.274121, -0.615635,
--0.142968, -0.201479, -0.0541993, 0.0475207, 0.222928,
--0.0327647, -0.0123197, -0.00380516, -0.149003, -0.313818,
--0.137811, -0.181652, 1.23463, 2.17364, 0.229491,
+1.98961, -0.0130782, 0.00232344, 0.034999, -0.111098,
+-0.0306255, 5.99275e-05, 0.0279719, -0.0122697, -0.0743631,
+-0.0265766, -0.0475938, -0.0358393, 0.0266045, -0.118931,
+2.27785, 3.14688, 0.407808, 1.40886, -0.0050141,
+-0.289166, -0.507755, -0.699676, -1.15413, 0.00851358,
+0.252532, -0.36482, -0.408518, 5.64382, 0.0997894,
+-0.0181532, 0.026249, 0.00308319, 0.0723668, 0.0562646,
+0.00452278, 0.083442, 0.0495319, 0.00983553, 0.136991,
+-0.0351484, 0.0259153, 0.00614155, 1.19176, -2.81638,
+-0.514078, 0.881382, -0.257151, -0.486597, -0.187349,
+-1.26923, -0.464027, 0.181971, 0.485618, 0.0691355,
+1.36658, -0.173798, -0.0413093, -0.265832, -1.01119,
+-0.0471417, 0.201685, -0.132444, -0.0361785, -0.0858083,
+0.0962927, -0.0848372, -0.152459, -0.287675, -0.822893,
+0.135702, -16.7317, 7.52835, -6.51729, 17.316,
+-0.138875, -0.591076, -0.87604, -0.0401374, -0.439988,
+0.514362, 0.978875, 0.114317, 0.652455, 2.75847,
+0.127838, -0.0673431, -0.058183, -0.104182, 0.073971,
+-0.00959418, 0.123298, 0.100977, 0.0675852, 0.0445473,
+0.166355, -0.0726645, -0.0597856, -0.0462871, 0.0931391,
+-0.431765, -0.954361, -0.852153, 0.0753634, -0.436737,
+-0.404625, -0.215624, -0.268892, 0.521599, -0.201161,
+-1.0103, 0.115346, -0.997492, 0.0101888, 0.266479,
+0.116699, 0.149483, 0.00956709, 0.0848689, 0.0285464,
+-0.0281497, 0.0368108, -0.0187327, 0.133623, 0.160256,
+0.132136, 0.0975494, 0.809777, -2.9119, 1.60906,
+0.433096, -0.0618059, -0.0886098, 0.027982, -0.817968,
+-0.08592, 0.0535723, 0.354925, 0.586248, 1.21096,
+-0.723206, -0.0159389, 0.0745776, 0.0526613, 0.133663,
+0.0446259, -0.0496362, 0.0188244, 0.0471644, -6.58117e-05,
+0.0333697, -0.0445606, 0.0627888, 0.0244612, 0.123508,
+4.18755, 1.53047, -1.88803, -4.54538, 0.20431,
+0.0413455, 0.50537, 0.338508, 0.252371, -0.000301334,
+-0.370387, -1.02901, -0.616503, -0.652812, 0.00404532,
+0.0624655, 0.255373, 0.05062, -0.00559389, -0.0594389,
+0.000658649, -0.0496338, 0.00196121, -0.00272021, -0.0455981,
+0.0595863, 0.193358, 0.030662, -2.04355, 1.55208,
+-0.523008, -0.15559, 0.177406, 0.230804, 0.70517,
+0.000395192, -0.379844, -0.0423835, -0.195152, -0.4136,
+-0.524797, 1.05256, -0.109296, -0.637306, 0.0539148,
+-0.0858552, -0.606899, -0.300064, -0.0766599, -0.0802716,
+0.00901309, 0.0400887, -0.456173, -0.47669, -0.00608932,
+-0.219657, -4.26855, 10.958, -13.9916, -3.13647,
+0.0392604, 0.214159, 0.120234, -0.321367, 0.139534,
+0.0780652, 0.151579, -0.797584, 0.4504, -0.521148,
+0.0438544, -0.445612, -1.2245, -0.378183, -0.00752445,
+-0.517898, 0.490149, 0.561133, -0.314982, 0.177619,
+-0.296543, 0.0727557, -0.830196, -0.328138, -0.829027,
+-0.0582978, 1.13614, -12.5848, 0.0546282, -0.97928,
+-0.754297, -0.223966, -0.175016, -0.0240597, 0.172425,
+0.209448, 2.25374, 1.69033, -0.0182459, -0.0484306,
+-0.0919532, 0.0966783, 0.365315, 0.494533, -0.727803,
+0.780301, -0.580837, 0.355177, -0.1704, 0.0151144,
+-0.240249, 0.197192, -7.70031, 0.771764, -0.98316,
+6.65569, -0.0561571, -0.125888, -0.0773176, 0.176193,
+0.711231, -0.167186, -0.0388936, -0.460926, 0.103807,
/* output layer */
--7.91184, -1.52122, 0.603183, -3.27692, 3.61369,
-1.16504, -1.1068, 2.80566, 0.85419, 0.545877,
-0.804097, };
+-1.36902, 1.7905, 2.34488, -0.948134, -1.51596,
+-2.37084, 3.30504, -4.03211, -0.918167, -0.841675,
+-0.859274, };
-static const int topo[3] = {17, 10, 1};
+static const int topo[3] = {27, 10, 1};
const MLP net = {
3,
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -870,8 +870,11 @@
nb_analysis_frames = frame_size/(st->Fs/100);
for (i=0;i<nb_analysis_frames;i++)
tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm_buf+i*(st->Fs/100)*st->channels, st->channels);
+ if (st->signal_type == OPUS_AUTO)
+ st->voice_ratio = floor(.5+100*(1-analysis_info.music_prob));
} else {
analysis_info.valid = 0;
+ st->voice_ratio = -1;
}
#endif