Coverage Report

Created: 2021-04-14 04:30

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c

(Counts use the k/M/G suffixes as reported. Bracketed values after a source line are execution counts for sub-line regions, such as ternary arms and loop init/increment clauses, listed in source order. The 18.4E count on line 55 is most likely a counter-wraparound artifact of instrumenting the threaded parallel_for region.)

 Line |  Count | Source
    1 |        | #include "ccv.h"
    2 |        | #include "ccv_internal.h"
    3 |        | #include "nnc/ccv_nnc.h"
    4 |        | #include "nnc/ccv_nnc_easy.h"
    5 |        | #include "nnc/ccv_nnc_internal.h"
    6 |        | #ifdef USE_OPENMP
    7 |        | #include <omp.h>
    8 |        | #endif
    9 |        | #ifdef USE_DISPATCH
   10 |        | #include <dispatch/dispatch.h>
   11 |        | #endif
   12 |        |
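Note on the two conditional includes above: they back the parallel_for/parallel_endfor macros used by both kernels below, which spread the loop over output channels across threads via OpenMP or libdispatch and otherwise degenerate to a plain serial loop. Below is a minimal sketch of one plausible expansion, assuming USE_OPENMP; the real macros live in ccv's internal headers, and the demo_ names here are illustrative only, not the ccv API.

#ifdef USE_OPENMP
// Plausible OpenMP-backed expansion: distribute iterations across threads.
#define demo_parallel_for(x, n) \
  _Pragma("omp parallel for") \
  for (int x = 0; x < (int)(n); x++) {
#else
// Serial fallback when no threading runtime is available.
#define demo_parallel_for(x, n) for (int x = 0; x < (int)(n); x++) {
#endif
#define demo_parallel_endfor }
// Usage: demo_parallel_for(k, count) { ...body indexed by k... } demo_parallel_endfor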
   13 |        | static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
   14 |    737 | {
   15 |    737 |   assert(input_size >= 2);
   16 |    737 |   const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
   17 |    737 |   const ccv_nnc_tensor_t* w = inputs[1];
   18 |    737 |   assert(!CCV_IS_TENSOR_VIEW(w));
   19 |    737 |   const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;   [734 : 3]
   20 |    737 |   assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
   21 |    737 |   assert(output_size == 1);
   22 |    737 |   ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
   23 |    737 |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
   24 |    737 |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
   25 |    737 |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;   [669 : 68]
   26 |    737 |   const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
   27 |    737 |   assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
   28 |    737 |   const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;   [669 : 68]
   29 |    737 |   assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution.count);
   30 |    737 |   int i;
   31 |    737 |   // Make sure the weights dimension matches the network dimension
   32 |  2.88k |   for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC; i++)   [i++: 2.14k]
   33 |  2.88k |   {
   34 |  2.88k |     if (w->info.dim[i] == 0 || cmd.info.size.dim[i - 1] == 0)   [2nd clause: 2.21k]
   35 |    737 |       break;
   36 |  2.14k |     assert(w->info.dim[i] == cmd.info.size.dim[i - 1]);
   37 |  2.14k |   }
   38 |    737 |   const int groups = cmd.info.convolution.groups;
   39 |    737 |   assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
   40 |    737 |   assert(cmd.info.convolution.count % groups == 0);
   41 |    737 |   const int group_size = cmd.info.convolution.count / groups;
   42 |    737 |   // Make sure the weights output dimension matches the network convolution kernels
   43 |    737 |   assert(w->info.dim[0] == cmd.info.convolution.count);
   44 |    737 |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;   [view arms: 0]
   45 |    737 |   const int* binc = CCV_IS_TENSOR_VIEW(b) ? ((b_nd == CCV_NNC_MAX_DIM + 1) ? b->inc : b->inc + 1) : bdim;   [view arms: 0]
   46 |    737 |   assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
   47 |    737 |   const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
   48 |    737 |   parallel_for(k, cmd.info.convolution.count) {
   49 |      0 |     int c;
   50 |      0 |     const int gidx = k / group_size;
   51 |      0 |     float* ap = a->data.f32;
   52 |      0 |     float* bp = b->data.f32 + k;
   53 |      0 |     // kernel weight for one dim.
   54 |      0 |     float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
   55 |  18.4E |     float biasval = bias ? bias->data.f32[k] : 0;   [bias arm: 31.3k]
   56 |      0 |     // This block runs once per parallel iteration, so it can be used to set up temporary variables.
   57 |      0 |     int i[CCV_NNC_MAX_DIM];
   58 |      0 |     int n[CCV_NNC_MAX_DIM];
   59 |      0 |     int m[CCV_NNC_MAX_DIM];
   60 |      0 |     int j[CCV_NNC_MAX_DIM];
   61 |  1.29M |     for (i[0] = 0; i[0] < bdim[0]; i[0]++)
   62 |  1.29M |     {
   63 |  1.29M |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
   64 |  1.29M |       float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
   65 |  81.5M |       for (i[1] = 0; i[1] < bdim[1]; i[1]++)   [i[1]++: 80.2M]
   66 |  80.2M |       {
   67 |  80.2M |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
   68 |  80.2M |         float p = biasval;
   69 |  80.2M |         float* wpz = wpu + n[1] * channel_size;
   70 |  80.2M |         float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
   71 |   277M |         for (j[0] = 0; j[0] < m[0]; j[0]++)   [j[0]++: 197M]
   72 |   197M |         {
   73 |   640M |           for (j[1] = 0; j[1] < m[1]; j[1]++)   [j[1]++: 442M]
   74 |  4.37G |             for (c = 0; c < channel_size; c++)   [init: 442M, c++: 3.92G]
   75 |  3.92G |               p += wpz[j[1] * channel_size + c] * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
   76 |   197M |           wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
   77 |   197M |           apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
   78 |   197M |         }
   79 |  80.2M |         bp[i[1] * binc[CCV_NNC_MAX_DIM]] = p;
   80 |  80.2M |       }
   81 |  1.29M |       bp += binc[CCV_NNC_MAX_DIM - 1] * binc[CCV_NNC_MAX_DIM];
   82 |  1.29M |       ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
   83 |  1.29M |     }
   84 |    737 |   } parallel_endfor
   85 |    737 |   return CCV_NNC_EXEC_SUCCESS;
   86 |    737 | }
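To make the control flow of the forward kernel easier to experiment with, here is a self-contained sketch of the same direct NHWC convolution for a single group, with per-tap bounds checks standing in for the SET_BORDER_OFFSET_SIZE_FOR window clipping and the ainc/binc view strides folded into dense row strides. It is an illustration under those assumptions, not the ccv_nnc entry point, and every name in it is hypothetical.

#include <stddef.h>

// a: input [ah][aw][ac]; w: kernels [count][kh][kw][ac]; bias: [count] or NULL;
// b: output [bh][bw][count]; zero padding of `border` pixels, square `stride`.
static void demo_conv2d_forw_nhwc(const float* a, int ah, int aw, int ac,
                                  const float* w, int kh, int kw,
                                  const float* bias, int count,
                                  int stride, int border,
                                  float* b, int bh, int bw)
{
  for (int k = 0; k < count; k++) // one output channel per iteration, like parallel_for(k, ...)
  {
    const float* wp = w + (size_t)k * kh * kw * ac;
    for (int y = 0; y < bh; y++)
      for (int x = 0; x < bw; x++)
      {
        float p = bias ? bias[k] : 0; // start from biasval
        for (int j0 = 0; j0 < kh; j0++)
          for (int j1 = 0; j1 < kw; j1++)
          {
            const int sy = y * stride - border + j0;
            const int sx = x * stride - border + j1;
            if (sy < 0 || sy >= ah || sx < 0 || sx >= aw)
              continue; // outside the border: contributes zero
            for (int c = 0; c < ac; c++)
              p += wp[((size_t)j0 * kw + j1) * ac + c]
                 * a[((size_t)sy * aw + sx) * ac + c];
          }
        b[((size_t)y * bw + x) * count + k] = p;
      }
  }
}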
   87 |        |
   88 |        | static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
   89 |  1.10k | {
   90 |  1.10k |   // inputs: gradient, forw prop input, [w]
   91 |  1.10k |   // outputs: [output gradient], weight updates, bias updates
   92 |  1.10k |   assert(input_size >= 2 && output_size >= 2);
   93 |  1.10k |   const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
   94 |  1.10k |   const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
   95 |  1.10k |   ccv_nnc_tensor_t* w = outputs[1];
   96 |  1.10k |   assert(!CCV_IS_TENSOR_VIEW(w));
   97 |  1.10k |   ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0;   [1.10k : 2]
   98 |  1.10k |   assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
   99 |  1.10k |   ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
  100 |  1.10k |   if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
  101 |  1.10k |   {
  102 |  1.10k |     memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
  103 |  1.10k |     if (bias)
  104 |  1.10k |       memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
  105 |  1.10k |   }
  106 |  1.10k |   const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
  107 |  1.10k |   assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
  108 |  1.10k |   const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;   [4 : 1.10k]
  109 |  1.10k |   const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
  110 |  1.10k |   assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
  111 |  1.10k |   const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;   [4 : 1.10k]
  112 |  1.10k |   const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;   [view arms: 0]
  113 |  1.10k |   const int* ginc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == CCV_NNC_MAX_DIM + 1) ? g->inc : g->inc + 1) : gdim;   [view arms: 0]
  114 |  1.10k |   const int groups = cmd.info.convolution.groups;
  115 |  1.10k |   assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
  116 |  1.10k |   assert(cmd.info.convolution.count % groups == 0);
  117 |  1.10k |   const int group_size = cmd.info.convolution.count / groups;
  118 |  1.10k |   const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
  119 |  1.10k |   parallel_for(k, cmd.info.convolution.count) {
  120 |      0 |     int c;
  121 |      0 |     const int gidx = k / group_size;
  122 |      0 |     float* ap = a->data.f32;
  123 |      0 |     float* gp = g->data.f32 + k;
  124 |      0 |     // kernel weight for one dim.
  125 |      0 |     float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3];
  126 |      0 |     float biasval = 0;
  127 |      0 |     int i[CCV_NNC_MAX_DIM];
  128 |      0 |     int n[CCV_NNC_MAX_DIM];
  129 |      0 |     int m[CCV_NNC_MAX_DIM];
  130 |      0 |     int j[CCV_NNC_MAX_DIM];
  131 |   559k |     for (i[0] = 0; i[0] < gdim[0]; i[0]++)
  132 |   559k |     {
  133 |   561k |       SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
  134 |   559k |       float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
  135 |  3.37M |       for (i[1] = 0; i[1] < gdim[1]; i[1]++)   [i[1]++: 2.81M]
  136 |  2.81M |       {
  137 |  2.81M |         SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
  138 |  2.81M |         const float v = gp[i[1] * gdim[CCV_NNC_MAX_DIM]];
  139 |  2.81M |         if (v == 0) // shortcut if v is zero
  140 |  2.33M |           continue;
  141 |   475k |         biasval += v;
  142 |   475k |         float* wpz = wpu + n[1] * channel_size;
  143 |   475k |         float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
  144 |  2.68M |         for (j[0] = 0; j[0] < m[0]; j[0]++)   [j[0]++: 2.20M]
  145 |  2.20M |         {
  146 |  8.02M |           for (j[1] = 0; j[1] < m[1]; j[1]++)   [j[1]++: 5.81M]
  147 |  42.4M |             for (c = 0; c < channel_size; c++)   [init: 5.81M, c++: 36.6M]
  148 |  36.6M |               wpz[j[1] * channel_size + c] += v * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
  149 |  2.20M |           wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
  150 |  2.20M |           apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
  151 |  2.20M |         }
  152 |   475k |       }
  153 |   559k |       gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
  154 |   559k |       ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
  155 |   559k |     }
  156 |      0 |     if (bias)
  157 |  45.9k |       bias->data.f32[k] = biasval;
  158 |  1.10k |   } parallel_endfor
  159 |  1.10k |   // If h is available, we need to propagate the gradients back.
  160 |  1.10k |   if (h)
  161 |  1.00k |   {
  162 |  1.00k |     assert(h);
  163 |  1.00k |     const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
  164 |  1.00k |     assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
  165 |  1.00k |     const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1;   [6 : 1.00k]
  166 |  1.00k |     const int* hinc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == CCV_NNC_MAX_DIM + 1) ? h->inc : h->inc + 1) : hdim;   [view arms: 0]
  167 |  1.00k |     // reset it to 0.
  168 |  1.00k |     ccv_nnc_tensor_zero(h);
  169 |  1.00k |     w = inputs[2];
  170 |  1.00k |     assert(!CCV_IS_TENSOR_VIEW(w));
  171 |  1.00k |     int k, gidx;
  172 |  2.01k |     for (gidx = 0; gidx < groups; gidx++)   [gidx++: 1.00k]
  173 |  44.8k |       for (k = gidx * group_size; k < (gidx + 1) * group_size; k++)   [init: 1.00k, k++: 43.8k]
  174 |  43.8k |       {
  175 |  43.8k |         int c;
  176 |  43.8k |         float* hp = h->data.f32;
  177 |  43.8k |         float* gp = g->data.f32 + k;
  178 |  43.8k |         // kernel weight for one dim.
  179 |  43.8k |         float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
  180 |  43.8k |         // This block runs once per loop iteration, so it can be used to set up temporary variables.
  181 |  43.8k |         int i[CCV_NNC_MAX_DIM];
  182 |  43.8k |         int n[CCV_NNC_MAX_DIM];
  183 |  43.8k |         int m[CCV_NNC_MAX_DIM];
  184 |  43.8k |         int j[CCV_NNC_MAX_DIM];
  185 |   650k |         for (i[0] = 0; i[0] < gdim[0]; i[0]++)   [i[0]++: 606k]
  186 |   606k |         {
  187 |   606k |           SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, h->info.dim, n, m);
  188 |   606k |           float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
  189 |  12.6M |           for (i[1] = 0; i[1] < gdim[1]; i[1]++)   [i[1]++: 12.0M]
  190 |  12.0M |           {
  191 |  12.0M |             SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, h->info.dim, n, m);
  192 |  12.0M |             const float v = gp[i[1] * ginc[CCV_NNC_MAX_DIM]];
  193 |  12.0M |             if (v == 0) // shortcut if v is zero
  194 |  11.3M |               continue;
  195 |   723k |             float* wpz = wpu + n[1] * channel_size;
  196 |   723k |             float* hpz = hp + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * hinc[CCV_NNC_MAX_DIM] + gidx * channel_size;
  197 |  1.33M |             for (j[0] = 0; j[0] < m[0]; j[0]++)   [j[0]++: 609k]
  198 |   609k |             {
  199 |  3.11M |               for (j[1] = 0; j[1] < m[1]; j[1]++)   [j[1]++: 2.50M]
  200 |  60.0M |                 for (c = 0; c < channel_size; c++)   [init: 2.50M, c++: 57.5M]
  201 |  57.5M |                   hpz[j[1] * hinc[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
  202 |   609k |               wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
  203 |   609k |               hpz += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM];
  204 |   609k |             }
  205 |   723k |           }
  206 |   606k |           gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
  207 |   606k |           hp += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
  208 |   606k |         }
  209 |  43.8k |       }
  210 |  1.00k |   }
  211 |  1.10k |   return CCV_NNC_EXEC_SUCCESS;
  212 |  1.10k |}
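The backward kernel above fuses three accumulations: bias gradients (biasval), weight gradients (the wpz loop), and, when h is requested, input gradients scattered back through the same window. Here is a self-contained sketch of those accumulations for a single group, under the same simplifying assumptions as the forward sketch (dense strides, per-tap bounds checks, hypothetical names); dw, dbias, and da are assumed pre-zeroed, mirroring the memset and ccv_nnc_tensor_zero calls above.

#include <stddef.h>

// g: incoming gradient [bh][bw][count]; a: forward input [ah][aw][ac];
// w: kernels [count][kh][kw][ac]; dw/dbias accumulate weight/bias gradients;
// da (may be NULL) accumulates input gradients, like h above.
static void demo_conv2d_back_nhwc(const float* g, int bh, int bw, int count,
                                  const float* a, int ah, int aw, int ac,
                                  const float* w, int kh, int kw,
                                  int stride, int border,
                                  float* dw, float* dbias, float* da)
{
  for (int k = 0; k < count; k++)
    for (int y = 0; y < bh; y++)
      for (int x = 0; x < bw; x++)
      {
        const float v = g[((size_t)y * bw + x) * count + k];
        if (v == 0)
          continue; // same zero-gradient shortcut as the kernel above
        dbias[k] += v;
        for (int j0 = 0; j0 < kh; j0++)
          for (int j1 = 0; j1 < kw; j1++)
          {
            const int sy = y * stride - border + j0;
            const int sx = x * stride - border + j1;
            if (sy < 0 || sy >= ah || sx < 0 || sx >= aw)
              continue; // padded taps contribute nothing
            for (int c = 0; c < ac; c++)
            {
              const size_t wi = (((size_t)k * kh + j0) * kw + j1) * ac + c;
              const size_t ai = ((size_t)sy * aw + sx) * ac + c;
              dw[wi] += v * a[ai];          // weight update
              if (da)
                da[ai] += v * w[wi];        // input gradient, like hpz above
            }
          }
      }
}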
  213 |        |
  214 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
  215 |      1 | {
  216 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
  217 |      1 |   registry->tensor_datatypes = CCV_32F;
  218 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  219 |      1 |   registry->algorithms = 1;
  220 |      1 |   registry->exec = _ccv_nnc_conv_forw;
  221 |      1 | }
  222 |        |
  223 |        | REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
  224 |      1 | {
  225 |      1 |   registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
  226 |      1 |   registry->tensor_datatypes = CCV_32F;
  227 |      1 |   registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  228 |      1 |   registry->algorithms = 1;
  229 |      1 |   registry->exec = _ccv_nnc_conv_back;
  230 |      1 | }
  231 |        |
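Both registrations advertise identical capabilities: NHWC layout, 32-bit float data, CPU memory, and a single algorithm, plus the exec entry point, which is how the dispatcher knows this reference backend can serve a convolution command. A toy mock of that registry pattern, with hypothetical demo_ names rather than the ccv_nnc API:

#include <stdio.h>

// Each backend fills a registry struct describing what it supports,
// plus a kernel entry point (signature simplified for the mock).
typedef struct {
  int tensor_formats;   // bitmask of supported layouts (e.g. NHWC)
  int tensor_datatypes; // bitmask of supported dtypes (e.g. 32-bit float)
  int tensor_memory;    // bitmask of supported memories (e.g. CPU)
  int algorithms;       // number of selectable algorithms
  int (*exec)(void);    // kernel entry point
} demo_cmd_backend_registry_t;

static int demo_conv_forw(void)
{
  printf("conv forward\n");
  return 0;
}

static void demo_register_conv_forw(demo_cmd_backend_registry_t* const registry)
{
  registry->tensor_formats = 1 << 0;
  registry->tensor_datatypes = 1 << 0;
  registry->tensor_memory = 1 << 0;
  registry->algorithms = 1;
  registry->exec = demo_conv_forw;
}

int main(void)
{
  demo_cmd_backend_registry_t registry;
  demo_register_conv_forw(&registry);
  return registry.exec(); // the dispatcher would pick this exec at run time
}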