Coverage Report

Created: 2022-07-27 23:53

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Reference CPU implementation of 2-D convolution forward pass, NHWC layout, 32-bit float.
// inputs: [0] input tensor a, [1] weights w (contiguous), [2] optional bias.
// outputs: [0] output tensor b. Returns CCV_NNC_EXEC_SUCCESS.
// a/b may be 3-D (H, W, C) or 4-D (N, H, W, C); the leading batch dim, if present, is skipped
// when deriving the spatial dims (adim/bdim) — NOTE(review): no explicit loop over the batch
// dim is visible here, so batching appears to be handled by the caller; confirm.
static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_t* w = inputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	// Bias is optional (third input); it must be a plain tensor, not a view.
	const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	// Skip the leading batch dimension if the tensor is 4-D.
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
	const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
	// Output channel count must match the number of convolution filters.
	assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution.count);
	int i;
	// Make sure the weights dimension matches the network dimension
	for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC; i++)
	{
		if (w->info.dim[i] == 0 || cmd.info.size.dim[i - 1] == 0)
			break;
		assert(w->info.dim[i] == cmd.info.size.dim[i - 1]);
	}
	const int groups = cmd.info.convolution.groups;
	// Per-group input channels times group count must equal the total input channels.
	assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	// Make sure the weights output dimension matches the network convolution kernels
	assert(w->info.dim[0] == cmd.info.convolution.count);
	// For tensor views, use the stored increments (row strides); otherwise the dims double as strides.
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
	const int* binc = CCV_IS_TENSOR_VIEW(b) ? ((b_nd == CCV_NNC_MAX_DIM + 1) ? b->inc : b->inc + 1) : bdim;
	assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
	const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
	// One parallel task per output channel (filter) k.
	parallel_for(k, cmd.info.convolution.count) {
		int c;
		const int gidx = k / group_size; // which channel group this filter belongs to
		float* ap = a->data.f32;
		float* bp = b->data.f32 + k;
		// kernel weight for one dim.
		float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
		float biasval = bias ? bias->data.f32[k] : 0;
		// This block is run once per task; use it to set up per-task temporaries.
		int i[CCV_NNC_MAX_DIM]; // output position
		int n[CCV_NNC_MAX_DIM]; // kernel start offset after border clipping
		int m[CCV_NNC_MAX_DIM]; // clipped kernel extent
		int j[CCV_NNC_MAX_DIM]; // kernel position
		for (i[0] = 0; i[0] < bdim[0]; i[0]++)
		{
			// Compute the valid kernel window (n, m) for this row given stride/border.
			SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
			float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
			for (i[1] = 0; i[1] < bdim[1]; i[1]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
				float p = biasval;
				float* wpz = wpu + n[1] * channel_size;
				// Input pointer for this window, offset into this filter's channel group.
				float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
				for (j[0] = 0; j[0] < m[0]; j[0]++)
				{
					// Accumulate the dot product of the kernel window and the input patch.
					for (j[1] = 0; j[1] < m[1]; j[1]++)
						for (c = 0; c < channel_size; c++)
							p += wpz[j[1] * channel_size + c] * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
					wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
				}
				bp[i[1] * binc[CCV_NNC_MAX_DIM]] = p;
			}
			bp += binc[CCV_NNC_MAX_DIM - 1] * binc[CCV_NNC_MAX_DIM];
			// Advance the input row pointer by however many input rows the stride consumed
			// (0 while still inside the top border, stride rows otherwise).
			ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
		}
	} parallel_endfor
	return CCV_NNC_EXEC_SUCCESS;
}
87
88
// Reference CPU implementation of 2-D convolution backward pass, NHWC layout, 32-bit float.
// inputs: [0] output gradient g, [1] forward-prop input a, [2] weights w (only read when h is requested).
// outputs: [0] optional input gradient h, [1] weight gradient, [2] optional bias gradient.
// Unless CCV_NNC_ACCUMULATE_OUTPUT is set, weight/bias gradients are zeroed first.
// Returns CCV_NNC_EXEC_SUCCESS.
// Fixes vs. previous revision:
//  - weight-gradient loop read gp[i[1] * gdim[...]]; for a tensor-view g the element stride is
//    ginc[...], not gdim[...] (they only coincide for non-views) — now uses ginc, consistent with
//    the input-gradient loop below and with the per-row advance of gp.
//  - h-propagation passed h->info.dim to SET_BORDER_OFFSET_SIZE_FOR; for a 4-D h that includes the
//    batch dim — now passes the batch-stripped hdim, consistent with the forward pass's use of adim.
static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// inputs: gradient, forw prop input, [w]
	// outputs: [output gradient], weight updates, bias updates
	assert(input_size >= 2 && output_size >= 2);
	const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_t* w = outputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
	if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
	{
		memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
		if (bias)
			memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
	}
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	// Skip the leading batch dimension if the tensor is 4-D.
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
	assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
	const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;
	// For tensor views, use the stored increments (row strides); otherwise the dims double as strides.
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
	const int* ginc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == CCV_NNC_MAX_DIM + 1) ? g->inc : g->inc + 1) : gdim;
	const int groups = cmd.info.convolution.groups;
	assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
	// Weight & bias gradients: one parallel task per filter k — each task owns w slice k and bias[k],
	// so no synchronization is needed.
	parallel_for(k, cmd.info.convolution.count) {
		int c;
		const int gidx = k / group_size; // which channel group this filter belongs to
		float* ap = a->data.f32;
		float* gp = g->data.f32 + k;
		// kernel weight for one dim.
		float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
		float biasval = 0;
		int i[CCV_NNC_MAX_DIM]; // output position
		int n[CCV_NNC_MAX_DIM]; // kernel start offset after border clipping
		int m[CCV_NNC_MAX_DIM]; // clipped kernel extent
		int j[CCV_NNC_MAX_DIM]; // kernel position
		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
		{
			SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
			float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
				// FIX: index with ginc (element stride), not gdim — they differ when g is a tensor view.
				const float v = gp[i[1] * ginc[CCV_NNC_MAX_DIM]];
				if (v == 0) // shortcut if v is zero
					continue;
				biasval += v;
				float* wpz = wpu + n[1] * channel_size;
				float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
				for (j[0] = 0; j[0] < m[0]; j[0]++)
				{
					// dW[k] += v * input patch
					for (j[1] = 0; j[1] < m[1]; j[1]++)
						for (c = 0; c < channel_size; c++)
							wpz[j[1] * channel_size + c] += v * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
					wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
				}
			}
			gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
			// Advance the input row pointer by however many input rows the stride consumed.
			ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
		}
		if (bias)
			bias->data.f32[k] = biasval;
	} parallel_endfor
	// If h is available, therefore, we need to propagate the gradients back
	if (h)
	{
		assert(h);
		const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
		assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
		const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1;
		const int* hinc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == CCV_NNC_MAX_DIM + 1) ? h->inc : h->inc + 1) : hdim;
		// reset it to 0.
		ccv_nnc_tensor_zero(h);
		// From here on, w is the forward weights (an input), not the gradient.
		w = inputs[2];
		assert(CCV_IS_TENSOR_CONTIGUOUS(w));
		int k, gidx;
		// Serial: every filter in a group scatters into the same h channels, so this cannot be
		// parallelized over k without synchronization.
		for (gidx = 0; gidx < groups; gidx++)
			for (k = gidx * group_size; k < (gidx + 1) * group_size; k++)
			{
				int c;
				float* hp = h->data.f32;
				float* gp = g->data.f32 + k;
				// kernel weight for one dim.
				float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
				// This block is run once per filter; use it to set up temporaries.
				int i[CCV_NNC_MAX_DIM];
				int n[CCV_NNC_MAX_DIM];
				int m[CCV_NNC_MAX_DIM];
				int j[CCV_NNC_MAX_DIM];
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					// FIX: use batch-stripped hdim (not h->info.dim) so a 4-D h clips against its
					// spatial dims, matching the forward pass's use of adim.
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, hdim, n, m);
					float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, hdim, n, m);
						const float v = gp[i[1] * ginc[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						float* wpz = wpu + n[1] * channel_size;
						float* hpz = hp + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * hinc[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							// dInput patch += v * kernel window
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									hpz[j[1] * hinc[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							hpz += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM];
						}
					}
					gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
					hp += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
213
214
// Register the reference CPU backend for convolution forward:
// NHWC 32-bit float tensors in CPU memory, a single algorithm, executed by _ccv_nnc_conv_forw.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
}
222
223
// Register the reference CPU backend for convolution backward:
// NHWC 32-bit float tensors in CPU memory, a single algorithm, executed by _ccv_nnc_conv_back.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
}
231