Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include <ccv.h>
2
#include <ccv_internal.h>
3
#include <nnc/ccv_nnc.h>
4
#include <nnc/ccv_nnc_easy.h>
5
#include <nnc/ccv_nnc_internal.h>
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
/**
 * CPU reference implementation of forward 2-D convolution (NHWC, fp32).
 *
 * inputs:  [0] a    — input activations (3-D HWC or 4-D NHWC tensor view)
 *          [1] w    — weight tensor [count, kh, kw, channels/groups], dense (not a view)
 *          [2] bias — optional per-filter bias [count], dense (not a view)
 * outputs: [0] b    — output activations (3-D or 4-D tensor view)
 *
 * Returns CCV_NNC_EXEC_SUCCESS. Parallelizes over output filters via
 * parallel_for (OpenMP / libdispatch depending on build flags).
 */
static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_t* w = inputs[1];
	assert(!CCV_IS_TENSOR_VIEW(w));
	// Bias is optional; when absent the accumulator starts from 0.
	const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	// Tensors may be 3-D (HWC) or 4-D (NHWC); skip the leading batch dim so
	// adim/bdim always point at [H, W, C].
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
	const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
	// Output channel count must equal the number of convolution filters.
	assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution.count);
	int i;
	// Make sure the weights dimension matches the network dimension
	for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC; i++)
	{
		if (w->info.dim[i] == 0 || cmd.info.size.dim[i - 1] == 0)
			break;
		assert(w->info.dim[i] == cmd.info.size.dim[i - 1]);
	}
	// Grouped convolution: input channels are split into `groups` slices, and
	// each group of `group_size` filters only reads its own slice.
	const int groups = cmd.info.convolution.groups;
	assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	// Make sure the weights output dimension matches the network convolution kernels
	assert(w->info.dim[0] == cmd.info.convolution.count);
	// For tensor views, `inc` gives the true row/channel strides; for dense
	// tensors the dims themselves are the strides.
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
	const int* binc = CCV_IS_TENSOR_VIEW(b) ? ((b_nd == CCV_NNC_MAX_DIM + 1) ? b->inc : b->inc + 1) : bdim;
	assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
	// Input channels seen by a single filter (= C / groups).
	const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
	// One parallel task per output filter k.
	parallel_for(k, cmd.info.convolution.count) {
		int c;
		const int gidx = k / group_size; // which input-channel group filter k reads
		float* ap = a->data.f32;
		float* bp = b->data.f32 + k; // write only channel k of the output
		// kernel weight for one dim.
		float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
		float biasval = bias ? bias->data.f32[k] : 0;
		// This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
		int i[CCV_NNC_MAX_DIM]; // output row/col position
		int n[CCV_NNC_MAX_DIM]; // kernel start offset after border clipping
		int m[CCV_NNC_MAX_DIM]; // clipped kernel extent at this position
		int j[CCV_NNC_MAX_DIM]; // kernel row/col iterator
		for (i[0] = 0; i[0] < bdim[0]; i[0]++)
		{
			// Compute the clipped kernel window (n, m) for this output row.
			SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
			float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
			for (i[1] = 0; i[1] < bdim[1]; i[1]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
				float p = biasval;
				float* wpz = wpu + n[1] * channel_size;
				// Left edge of the receptive field in the input, offset into this filter's channel group.
				float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
				for (j[0] = 0; j[0] < m[0]; j[0]++)
				{
					// Inner product over the (clipped) kernel window and channels.
					for (j[1] = 0; j[1] < m[1]; j[1]++)
						for (c = 0; c < channel_size; c++)
							p += wpz[j[1] * channel_size + c] * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
					wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
				}
				bp[i[1] * binc[CCV_NNC_MAX_DIM]] = p;
			}
			bp += binc[CCV_NNC_MAX_DIM - 1] * binc[CCV_NNC_MAX_DIM];
			// Advance the input pointer by the number of source rows actually
			// consumed (accounts for stride and top-border clipping).
			ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
		}
	} parallel_endfor
	return CCV_NNC_EXEC_SUCCESS;
}
87
88
/**
 * CPU reference implementation of 2-D convolution backward pass (NHWC, fp32).
 *
 * inputs:  [0] g — incoming gradient w.r.t. the forward output
 *          [1] a — forward-pass input activations
 *          [2] w — forward weights (only needed when h is requested)
 * outputs: [0] h    — optional gradient w.r.t. the input (may be 0)
 *          [1] w    — weight gradient accumulator (dense, not a view)
 *          [2] bias — optional bias gradient accumulator (dense)
 *
 * Unless CCV_NNC_ACCUMULATE_OUTPUT is set, w/bias gradients are zeroed first.
 * Returns CCV_NNC_EXEC_SUCCESS.
 *
 * Fixes vs. the original:
 *  - gradient element read used `gdim` as the channel stride; every other
 *    access to g in this function uses `ginc`, so views with ginc != gdim
 *    read the wrong element. Now uses `ginc`.
 *  - the back-propagation section passed raw `h->info.dim` to
 *    SET_BORDER_OFFSET_SIZE_FOR even though a batch-adjusted `hdim` is
 *    computed (the forward pass passes the adjusted `adim`); now uses `hdim`.
 *  - the per-filter weight offset used w->info.dim[3] where the rest of the
 *    file uses `channel_size` (same value); normalized for consistency.
 */
static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// inputs: gradient, forw prop input, [w]
	// outputs: [output gradient], weight updates, bias updates
	assert(input_size >= 2 && output_size >= 2);
	const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_t* w = outputs[1];
	assert(!CCV_IS_TENSOR_VIEW(w));
	ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
	if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
	{
		memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
		if (bias)
			memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
	}
	// Tensors may be 3-D (HWC) or 4-D (NHWC); skip the batch dim so
	// adim/gdim point at [H, W, C].
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
	assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
	const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;
	// True strides: `inc` for tensor views, dims otherwise.
	const int* ainc = CCV_IS_TENSOR_VIEW(a) ? ((a_nd == CCV_NNC_MAX_DIM + 1) ? a->inc : a->inc + 1) : adim;
	const int* ginc = CCV_IS_TENSOR_VIEW(g) ? ((g_nd == CCV_NNC_MAX_DIM + 1) ? g->inc : g->inc + 1) : gdim;
	const int groups = cmd.info.convolution.groups;
	assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	// Input channels seen by one filter (= C / groups).
	const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
	// Weight & bias gradients: one parallel task per filter k.
	parallel_for(k, cmd.info.convolution.count) {
		int c;
		const int gidx = k / group_size; // input-channel group filter k reads
		float* ap = a->data.f32;
		float* gp = g->data.f32 + k; // read only channel k of the gradient
		// kernel weight for one dim.
		// Use channel_size (== w->info.dim[CCV_NNC_MAX_DIM + 1]) for consistency with the forward pass.
		float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
		float biasval = 0;
		int i[CCV_NNC_MAX_DIM]; // output row/col position
		int n[CCV_NNC_MAX_DIM]; // kernel start offset after border clipping
		int m[CCV_NNC_MAX_DIM]; // clipped kernel extent at this position
		int j[CCV_NNC_MAX_DIM]; // kernel iterator
		for (i[0] = 0; i[0] < gdim[0]; i[0]++)
		{
			SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, adim, n, m);
			float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
			for (i[1] = 0; i[1] < gdim[1]; i[1]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, adim, n, m);
				// FIX: index with the stride (ginc), not the dimension (gdim);
				// they differ when g is a tensor view (matches line 192's use of ginc).
				const float v = gp[i[1] * ginc[CCV_NNC_MAX_DIM]];
				if (v == 0) // shortcut if v is zero
					continue;
				biasval += v;
				float* wpz = wpu + n[1] * channel_size;
				float* apz = ap + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * ainc[CCV_NNC_MAX_DIM] + gidx * channel_size;
				for (j[0] = 0; j[0] < m[0]; j[0]++)
				{
					// dL/dw += dL/dy * x over the clipped kernel window.
					for (j[1] = 0; j[1] < m[1]; j[1]++)
						for (c = 0; c < channel_size; c++)
							wpz[j[1] * channel_size + c] += v * apz[j[1] * ainc[CCV_NNC_MAX_DIM] + c];
					wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					apz += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM];
				}
			}
			gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
			// Advance by the number of input rows consumed (stride/border aware).
			ap += ainc[CCV_NNC_MAX_DIM - 1] * ainc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
		}
		if (bias)
			bias->data.f32[k] = biasval;
	} parallel_endfor
	// If h is available, therefore, we need to propagate the gradients back
	if (h)
	{
		assert(h);
		const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
		assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
		const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1;
		const int* hinc = CCV_IS_TENSOR_VIEW(h) ? ((h_nd == CCV_NNC_MAX_DIM + 1) ? h->inc : h->inc + 1) : hdim;
		// reset it to 0.
		ccv_nnc_tensor_zero(h);
		w = inputs[2];
		assert(!CCV_IS_TENSOR_VIEW(w));
		int k, gidx;
		for (gidx = 0; gidx < groups; gidx++)
			for (k = gidx * group_size; k < (gidx + 1) * group_size; k++)
			{
				int c;
				float* hp = h->data.f32;
				float* gp = g->data.f32 + k;
				// kernel weight for one dim.
				float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
				// This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
				int i[CCV_NNC_MAX_DIM];
				int n[CCV_NNC_MAX_DIM];
				int m[CCV_NNC_MAX_DIM];
				int j[CCV_NNC_MAX_DIM];
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					// FIX: use the batch-adjusted hdim (like the forward pass uses
					// adim), not raw h->info.dim, which is off by the batch dim for 4-D h.
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, w->info.dim + 1, hdim, n, m);
					float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, w->info.dim + 1, hdim, n, m);
						const float v = gp[i[1] * ginc[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						float* wpz = wpu + n[1] * channel_size;
						float* hpz = hp + ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) * hinc[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							// dL/dx += dL/dy * w over the clipped kernel window.
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									hpz[j[1] * hinc[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							hpz += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM];
						}
					}
					gp += ginc[CCV_NNC_MAX_DIM - 1] * ginc[CCV_NNC_MAX_DIM];
					hp += hinc[CCV_NNC_MAX_DIM - 1] * hinc[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
213
214
// Register the CPU reference kernel for forward convolution: NHWC layout,
// fp32 only, CPU memory, a single algorithm choice.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_forw;       // the kernel entry point
	registry->algorithms = 1;                  // only one (reference) algorithm
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;      // single-precision floats only
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
}
222
223
// Register the CPU reference kernel for backward convolution: NHWC layout,
// fp32 only, CPU memory, a single algorithm choice.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_back;       // the kernel entry point
	registry->algorithms = 1;                  // only one (reference) algorithm
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;      // single-precision floats only
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
}
231