Coverage Report

Created: 2021-04-14 04:30

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/blas/ccv_nnc_gemm_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Forward GEMM, CPU reference implementation: computes b = a * w (+ bias), batched.
// inputs[0] = a, inputs[1] = w, optional inputs[2] = bias; outputs[0] = b.
// Handles tensor views through per-dimension increments (strides), honors the
// transpose_a / transpose_b flags via ccv_nnc_tensor_get_matrix_params, and
// broadcasts a singleton batch in a, w or bias against b's batch size by
// zeroing the corresponding batch increment.
static int _ccv_nnc_gemm_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (const ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_view_t* w = (const ccv_nnc_tensor_view_t*)inputs[1];
	const ccv_nnc_tensor_view_t* bias = input_size > 2 ? (const ccv_nnc_tensor_view_t*)inputs[2] : 0; // bias is optional
	// Copy the most of parameters, but reshape the dimension of a to a vector.
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(!bias || (bias->info.dim[1] == 0 || bias->info.dim[2] == 0 || bias->info.dim[3] == 0)); // It is a 1-d array
	// Per-tensor matrix geometry: batch size, rows, cols and the increments
	// used to step each of those dimensions in the flat f32 buffer.
	int a_batch_size, a_rows, a_cols, a_batch_inc, a_rows_inc, a_cols_inc;
	int w_batch_size, w_rows, w_cols, w_batch_inc, w_rows_inc, w_cols_inc;
	int b_batch_size, b_rows, b_cols, b_batch_inc, b_rows_inc, b_cols_inc;
	const static int no_transpose[2] = {};
	// Views carry their own increments (->inc); dense tensors use ->info.dim.
	ccv_nnc_tensor_get_matrix_params(a->info, CCV_IS_TENSOR_VIEW(a) ? a->inc : a->info.dim, cmd.info.blas.transpose_a, &a_batch_size, &a_rows, &a_cols, &a_batch_inc, &a_rows_inc, &a_cols_inc);
	ccv_nnc_tensor_get_matrix_params(w->info, CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim, cmd.info.blas.transpose_b, &w_batch_size, &w_rows, &w_cols, &w_batch_inc, &w_rows_inc, &w_cols_inc);
	ccv_nnc_tensor_get_matrix_params(b->info, CCV_IS_TENSOR_VIEW(b) ? b->inc : b->info.dim, no_transpose, &b_batch_size, &b_rows, &b_cols, &b_batch_inc, &b_rows_inc, &b_cols_inc);
	assert(ccv_max(a_batch_size, w_batch_size) == b_batch_size);
	assert(a_batch_size == b_batch_size || a_batch_size == 1);
	if (a_batch_size == 1 && b_batch_size > 1)
		a_batch_inc = 0; // broadcast a across all batches
	assert(w_batch_size == b_batch_size || w_batch_size == 1);
	if (w_batch_size == 1 && b_batch_size > 1)
		w_batch_inc = 0; // broadcast w across all batches
	// GEMM shape compatibility: (a_rows x a_cols) * (w_rows x w_cols) = (b_rows x b_cols).
	assert(a_rows == b_rows);
	assert(a_cols == w_rows);
	assert(w_cols == b_cols);
	int n, i;
	if (bias)
	{
		int bias_batch_size, bias_rows, bias_cols, bias_batch_inc, bias_rows_inc, bias_cols_inc;
		ccv_nnc_tensor_get_matrix_params(bias->info, CCV_IS_TENSOR_VIEW(bias) ? bias->inc : bias->info.dim, no_transpose, &bias_batch_size, &bias_rows, &bias_cols, &bias_batch_inc, &bias_rows_inc, &bias_cols_inc);
		assert(bias_batch_size == b_batch_size || bias_batch_size == 1);
		if (bias_batch_size == 1 && b_batch_size > 1)
			bias_batch_inc = 0; // broadcast bias across batches
		if (bias_rows == 1 && b_rows > 1)
			bias_rows_inc = 0; // broadcast the single bias row across all output rows
		assert(bias_cols == b_cols);
		for (n = 0; n < b_batch_size; n++)
		{
			const float* const ap = a->data.f32 + n * a_batch_inc;
			const float* const wp = w->data.f32 + n * w_batch_inc;
			const float* const biasp = bias->data.f32 + n * bias_batch_inc;
			float* const bp = b->data.f32 + n * b_batch_inc;
			for (i = 0; i < b_rows; i++)
			{
				const float* const api = ap + i * a_rows_inc;
				const float* const biaspi = biasp + i * bias_rows_inc;
				float* const bpi = bp + i * b_rows_inc;
				// Columns computed in parallel; each starts from the bias value.
				parallel_for(j, b_cols) {
					float v = biaspi[j * bias_cols_inc];
					const float* const wpj = wp + j * w_cols_inc;
					int k;
					for (k = 0; k < a_cols; k++)
						v += wpj[k * w_rows_inc] * api[k * a_cols_inc]; // dot(a row i, w column j)
					bpi[j * b_cols_inc] = v;
				} parallel_endfor
			}
		}
	} else {
		// Same GEMM without the bias term: each output starts from 0.
		for (n = 0; n < b_batch_size; n++)
		{
			const float* const ap = a->data.f32 + n * a_batch_inc;
			const float* const wp = w->data.f32 + n * w_batch_inc;
			float* const bp = b->data.f32 + n * b_batch_inc;
			for (i = 0; i < b_rows; i++)
			{
				const float* const api = ap + i * a_rows_inc;
				float* const bpi = bp + i * b_rows_inc;
				parallel_for(j, b_cols) {
					float v = 0;
					const float* const wpj = wp + j * w_cols_inc;
					int k;
					for (k = 0; k < a_cols; k++)
						v += wpj[k * w_rows_inc] * api[k * a_cols_inc];
					bpi[j * b_cols_inc] = v;
				} parallel_endfor
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
95
96
// Backward GEMM, CPU reference implementation. Given the incoming gradient g
// (inputs[0]), the forward input a (inputs[1]) and the weights w (inputs[2]),
// it produces (each output optional, skipped when the pointer is 0):
//   outputs[0] = h   gradient w.r.t. a (h = g * w^T),
//   outputs[1] = dw  gradient w.r.t. w (dw += a^T * g),
//   outputs[2] = bias gradient (column sums of g).
// When CCV_NNC_ACCUMULATE_OUTPUT is set in flags, gradients accumulate into
// the existing output buffers instead of being zeroed first.
static int _ccv_nnc_gemm_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// inputs: gradient, forw prop input, [w]
	// outputs: [output gradient], weight updates, bias updates
	assert(input_size >= 2 && output_size >= 2);
	const ccv_nnc_tensor_view_t* g = (const ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* dw = (ccv_nnc_tensor_view_t*)outputs[1];
	ccv_nnc_tensor_view_t* bias = output_size > 2 ? (ccv_nnc_tensor_view_t*)outputs[2] : 0;
	assert(!bias || (bias->info.dim[1] == 0 || bias->info.dim[2] == 0 || bias->info.dim[3] == 0)); // It is a 1-d array (same vector check as the forward pass)
	int g_batch_size, g_rows, g_cols, g_batch_inc, g_rows_inc, g_cols_inc;
	const static int no_transpose[2] = {};
	ccv_nnc_tensor_get_matrix_params(g->info, CCV_IS_TENSOR_VIEW(g) ? g->inc : g->info.dim, no_transpose, &g_batch_size, &g_rows, &g_cols, &g_batch_inc, &g_rows_inc, &g_cols_inc);
	int n, i;
	if (bias)
	{
		if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
			ccv_nnc_tensor_zero(bias);
		int bias_batch_size, bias_rows, bias_cols, bias_batch_inc, bias_rows_inc, bias_cols_inc;
		ccv_nnc_tensor_get_matrix_params(bias->info, CCV_IS_TENSOR_VIEW(bias) ? bias->inc : bias->info.dim, no_transpose, &bias_batch_size, &bias_rows, &bias_cols, &bias_batch_inc, &bias_rows_inc, &bias_cols_inc);
		assert(bias_cols == g_cols);
		assert(bias_batch_size == 1 || bias_batch_size == g_batch_size);
		if (bias_batch_size == 1 && g_batch_size > 1)
			bias_batch_inc = 0; // broadcast: all batches accumulate into one bias gradient
		if (bias_rows == 1 && g_rows > 1)
			bias_rows_inc = 0; // broadcast: all rows accumulate into the single bias row
		int j;
		for (n = 0; n < g_batch_size; n++)
		{
			const float* const gp = g->data.f32 + n * g_batch_inc;
			float* const bp = bias->data.f32 + n * bias_batch_inc;
			for (i = 0; i < g_rows; i++)
			{
				const float* const gpi = gp + i * g_rows_inc;
				float* const bpi = bp + i * bias_rows_inc;
				// d(bias) += g summed over rows (and batches when broadcast).
				for (j = 0; j < g_cols; j++)
					bpi[j * bias_cols_inc] += gpi[j * g_cols_inc];
			}
		}
	}
	if (dw)
	{
		if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
			ccv_nnc_tensor_zero(dw);
		const ccv_nnc_tensor_view_t* a = (const ccv_nnc_tensor_view_t*)inputs[1];
		int a_batch_size, a_rows, a_cols, a_batch_inc, a_rows_inc, a_cols_inc;
		int dw_batch_size, dw_rows, dw_cols, dw_batch_inc, dw_rows_inc, dw_cols_inc;
		ccv_nnc_tensor_get_matrix_params(a->info, CCV_IS_TENSOR_VIEW(a) ? a->inc : a->info.dim, cmd.info.blas.transpose_a, &a_batch_size, &a_rows, &a_cols, &a_batch_inc, &a_rows_inc, &a_cols_inc);
		ccv_nnc_tensor_get_matrix_params(dw->info, CCV_IS_TENSOR_VIEW(dw) ? dw->inc : dw->info.dim, cmd.info.blas.transpose_b, &dw_batch_size, &dw_rows, &dw_cols, &dw_batch_inc, &dw_rows_inc, &dw_cols_inc);
		assert(a_rows == g_rows);
		assert(a_cols == dw_rows);
		assert(dw_cols == g_cols);
		assert(a_batch_size == g_batch_size || a_batch_size == 1);
		if (a_batch_size == 1 && g_batch_size > 1)
			a_batch_inc = 0; // broadcast a across batches
		assert(dw_batch_size == g_batch_size || dw_batch_size == 1);
		if (dw_batch_size == 1 && g_batch_size > 1)
			dw_batch_inc = 0; // broadcast: all batches accumulate into one weight gradient
		for (n = 0; n < g_batch_size; n++)
		{
			const float* const gp = g->data.f32 + n * g_batch_inc;
			const float* const ap = a->data.f32 + n * a_batch_inc;
			float* const dwp = dw->data.f32 + n * dw_batch_inc;
			for (i = 0; i < a_rows; i++)
			{
				const float* const gpi = gp + i * g_rows_inc;
				const float* const api = ap + i * a_rows_inc;
				// d(w) += outer product of row i of a with row i of g,
				// parallelized over the g columns.
				parallel_for(j, g_cols) {
					const float v = gpi[j * g_cols_inc];
					float* dwpj = dwp + j * dw_cols_inc;
					int k;
					for (k = 0; k < a_cols; k++)
						dwpj[k * dw_rows_inc] += api[k * a_cols_inc] * v;
				} parallel_endfor
			}
		}
	}
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
	if (h)
	{
		const int zero_h = !(flags & CCV_NNC_ACCUMULATE_OUTPUT); // reset the gradients to 0
		const ccv_nnc_tensor_view_t* w = (const ccv_nnc_tensor_view_t*)inputs[2];
		int h_batch_size, h_rows, h_cols, h_batch_inc, h_rows_inc, h_cols_inc;
		int w_batch_size, w_rows, w_cols, w_batch_inc, w_rows_inc, w_cols_inc;
		ccv_nnc_tensor_get_matrix_params(h->info, CCV_IS_TENSOR_VIEW(h) ? h->inc : h->info.dim, cmd.info.blas.transpose_a, &h_batch_size, &h_rows, &h_cols, &h_batch_inc, &h_rows_inc, &h_cols_inc);
		ccv_nnc_tensor_get_matrix_params(w->info, CCV_IS_TENSOR_VIEW(w) ? w->inc : w->info.dim, cmd.info.blas.transpose_b, &w_batch_size, &w_rows, &w_cols, &w_batch_inc, &w_rows_inc, &w_cols_inc);
		assert(h_cols == w_rows);
		assert(w_cols == g_cols);
		assert(h_batch_size == g_batch_size || h_batch_size == 1);
		if (h_batch_size == 1 && g_batch_size > 1)
			h_batch_inc = 0; // broadcast h across batches
		assert(w_batch_size == g_batch_size || w_batch_size == 1);
		if (w_batch_size == 1 && g_batch_size > 1)
			w_batch_inc = 0; // broadcast w across batches
		for (n = 0; n < g_batch_size; n++)
		{
			const float* const gp = g->data.f32 + n * g_batch_inc;
			const float* const wp = w->data.f32 + n * w_batch_inc;
			float* const hp = h->data.f32 + n * h_batch_inc;
			for (i = 0; i < h_rows; i++)
			{
				const float* const gpi = gp + i * g_rows_inc;
				float* const hpi = hp + i * h_rows_inc;
				// d(a) row i = g row i times w^T; no upfront tensor_zero here —
				// the accumulate-vs-overwrite choice is folded into the
				// initial value v instead.
				parallel_for(j, h_cols) {
					const float* const wpj = wp + j * w_rows_inc;
					float v = zero_h ? 0 : hpi[j * h_cols_inc];
					int k;
					for (k = 0; k < g_cols; k++)
						v += wpj[k * w_cols_inc] * gpi[k * g_cols_inc];
					hpi[j * h_cols_inc] = v;
				} parallel_endfor
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
211
212
// Register the CPU reference forward-GEMM kernel: 32-bit float only, NHWC or
// NCHW layout, data resident in CPU memory, a single algorithm variant.
REGISTER_COMMAND_BACKEND(CCV_NNC_GEMM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_gemm_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
220
221
// Register the CPU reference backward-GEMM kernel with the same capabilities
// as the forward registration: f32, NHWC/NCHW, CPU memory, one algorithm.
REGISTER_COMMAND_BACKEND(CCV_NNC_GEMM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_gemm_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}