ref: 9791b22b2c83980f6b4386c870cad58557c78007
parent: 054acff3c142c147e3b5801a13347f04c56f0eb5
author: Jean-Marc Valin <[email protected]>
date: Thu Nov 22 09:06:34 EST 2018
Refactoring: Isolating the matrix-vector product in gemm_accum()
--- a/src/mlp.c
+++ b/src/mlp.c
@@ -69,9 +69,19 @@
return .5f + .5f*tansig_approx(.5f*x);
}
-void compute_dense(const DenseLayer *layer, float *output, const float *input)
+static void gemm_accum(float *out, const opus_int8 *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
+ for (i=0;i<rows;i++)
+ {
+ for (j=0;j<cols;j++)
+ out[i] += weights[j*col_stride + i]*x[j];
+ }
+}
+
+void compute_dense(const DenseLayer *layer, float *output, const float *input)
+{
+ int i;
int N, M;
int stride;
M = layer->nb_inputs;
@@ -78,13 +88,10 @@
N = layer->nb_neurons;
stride = N;
for (i=0;i<N;i++)
- {
- /* Compute update gate. */
- float sum = layer->bias[i];
- for (j=0;j<M;j++)
- sum += layer->input_weights[j*stride + i]*input[j];
- output[i] = WEIGHTS_SCALE*sum;
- }
+ output[i] = layer->bias[i];
+ gemm_accum(output, layer->input_weights, N, M, stride, input);
+ for (i=0;i<N;i++)
+ output[i] *= WEIGHTS_SCALE;
if (layer->sigmoid) {
for (i=0;i<N;i++)
output[i] = sigmoid_approx(output[i]);
@@ -96,9 +103,10 @@
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
- int i, j;
+ int i;
int N, M;
int stride;
+ float tmp[MAX_NEURONS];
float z[MAX_NEURONS];
float r[MAX_NEURONS];
float h[MAX_NEURONS];
@@ -105,36 +113,31 @@
M = gru->nb_inputs;
N = gru->nb_neurons;
stride = 3*N;
+ /* Compute update gate. */
for (i=0;i<N;i++)
- {
- /* Compute update gate. */
- float sum = gru->bias[i];
- for (j=0;j<M;j++)
- sum += gru->input_weights[j*stride + i]*input[j];
- for (j=0;j<N;j++)
- sum += gru->recurrent_weights[j*stride + i]*state[j];
- z[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
- }
+ z[i] = gru->bias[i];
+ gemm_accum(z, gru->input_weights, N, M, stride, input);
+ gemm_accum(z, gru->recurrent_weights, N, N, stride, state);
for (i=0;i<N;i++)
- {
- /* Compute reset gate. */
- float sum = gru->bias[N + i];
- for (j=0;j<M;j++)
- sum += gru->input_weights[N + j*stride + i]*input[j];
- for (j=0;j<N;j++)
- sum += gru->recurrent_weights[N + j*stride + i]*state[j];
- r[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
- }
+ z[i] = sigmoid_approx(WEIGHTS_SCALE*z[i]);
+
+ /* Compute reset gate. */
for (i=0;i<N;i++)
- {
- /* Compute output. */
- float sum = gru->bias[2*N + i];
- for (j=0;j<M;j++)
- sum += gru->input_weights[2*N + j*stride + i]*input[j];
- for (j=0;j<N;j++)
- sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j];
- h[i] = z[i]*state[i] + (1-z[i])*tansig_approx(WEIGHTS_SCALE*sum);
- }
+ r[i] = gru->bias[N + i];
+ gemm_accum(r, &gru->input_weights[N], N, M, stride, input);
+ gemm_accum(r, &gru->recurrent_weights[N], N, N, stride, state);
+ for (i=0;i<N;i++)
+ r[i] = sigmoid_approx(WEIGHTS_SCALE*r[i]);
+
+ /* Compute output. */
+ for (i=0;i<N;i++)
+ h[i] = gru->bias[2*N + i];
+ for (i=0;i<N;i++)
+ tmp[i] = state[i] * r[i];
+ gemm_accum(h, &gru->input_weights[2*N], N, M, stride, input);
+ gemm_accum(h, &gru->recurrent_weights[2*N], N, N, stride, tmp);
+ for (i=0;i<N;i++)
+ h[i] = z[i]*state[i] + (1-z[i])*tansig_approx(WEIGHTS_SCALE*h[i]);
for (i=0;i<N;i++)
state[i] = h[i];
}