Coverage Report

Created: 2025-02-24 17:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
14
751
{
15
751
  assert(input_size >= 2);
16
751
  const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
17
751
  const ccv_nnc_tensor_t* w = inputs[1];
18
751
  assert(CCV_IS_TENSOR_CONTIGUOUS(w));
19
751
  const ccv_nnc_tensor_t* bias = input_size > 2 ? 
inputs[2]748
:
03
;
20
751
  assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
21
751
  assert(output_size == 1);
22
751
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
23
751
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
24
751
  const int size_nd = ccv_nnc_tensor_nd(cmd.info.size.dim) - 1;
25
751
  assert(size_nd == 2 || size_nd == 3);
26
751
  assert(a_nd == size_nd + 1 || a_nd == size_nd + 2);
27
751
  const int* adim = (a_nd == size_nd + 1) ? 
a->info.dim679
:
a->info.dim + 172
;
28
751
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
29
751
  assert(b_nd == size_nd + 1 || b_nd == size_nd + 2);
30
751
  const int* bdim = (b_nd == size_nd + 1) ? 
b->info.dim679
:
b->info.dim + 172
;
31
751
  const int groups = cmd.info.convolution.groups;
32
751
  assert(cmd.info.convolution.count % groups == 0);
33
751
  const int group_size = cmd.info.convolution.count / groups;
34
  // Make sure the weights output dimension matches the network convolution kernels
35
751
  assert(w->info.dim[0] == cmd.info.convolution.count);
36
751
  int astride[CCV_NNC_MAX_DIM_ALLOC];
37
751
  ccv_nnc_tensor_view_get_stride(a, astride);
38
751
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
39
751
  ccv_nnc_tensor_view_get_stride(b, bstride);
40
751
  assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
41
751
  const int batch_size = (a_nd == size_nd + 2) ? 
a->info.dim[0]72
:
1679
;
42
751
  int dilation[size_nd];
43
751
  int i;
44
2.25k
  for (i = 0; i < size_nd; 
i++1.50k
)
45
1.50k
    dilation[i] = ccv_max(cmd.info.convolution.dilation[i], 1);
46
751
  if (a->info.format == CCV_TENSOR_FORMAT_NHWC)
47
749
  {
48
    // Make sure the weights dimension matches the network dimension
49
2.24k
    for (i = 0; i < size_nd; 
i++1.50k
)
50
1.50k
      { assert(w->info.dim[i + 1] == cmd.info.size.dim[i]); }
51
749
    int wdim[size_nd];
52
2.24k
    for (i = 0; i < size_nd; 
i++1.50k
)
53
1.50k
      wdim[i] = (w->info.dim[i + 1] - 1) * dilation[i] + 1;
54
749
    assert(w->info.dim[size_nd + 1] * groups == adim[size_nd]);
55
749
    assert(b->info.format == CCV_TENSOR_FORMAT_NHWC);
56
749
    const int channel_size = w->info.dim[size_nd + 1];
57
749
    assert(bdim[size_nd] == cmd.info.convolution.count);
58
749
    if (size_nd == 2)
59
747
    {
60
41.4k
      
parallel_for747
(idx, cmd.info.convolution.count * batch_size) {
61
41.4k
        int c;
62
41.4k
        const int bidx = idx / cmd.info.convolution.count;
63
41.4k
        const int k = idx % cmd.info.convolution.count;
64
41.4k
        const int gidx = k / group_size;
65
41.4k
        float* ap = a->data.f32 + bidx * astride[0];
66
41.4k
        float* bp = b->data.f32 + bidx * bstride[0] + k;
67
        // kernel weight for one dim.
68
41.4k
        float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
69
41.4k
        float biasval = bias ? 
bias->data.f32[k]41.4k
:
08
;
70
        // This block will be called in each for-loop iteration; therefore, you can use it to generate some temporary variables.
71
41.4k
        int i[2];
72
41.4k
        int n[2];
73
41.4k
        int d[2];
74
41.4k
        int m[2];
75
41.4k
        int j[2];
76
2.01M
        for (i[0] = 0; i[0] < bdim[0]; 
i[0]++1.96M
)
77
1.96M
        {
78
1.96M
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
79
1.96M
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
80
1.96M
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
81
1.96M
          d[0] = n0 * dilation[0] - n[0];
82
1.96M
          n[0] = n0;
83
1.96M
          m[0] = m[0] - n[0];
84
1.96M
          float* wpu = wp + n[0] * w->info.dim[2] * channel_size;
85
181M
          for (i[1] = 0; i[1] < bdim[1]; 
i[1]++179M
)
86
179M
          {
87
179M
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
88
179M
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
89
179M
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
90
179M
            d[1] = n1 * dilation[1] - n[1];
91
179M
            n[1] = n1;
92
179M
            m[1] = m[1] - n[1];
93
179M
            float p = biasval;
94
179M
            float* wpz = wpu + n[1] * channel_size;
95
179M
            float* apz = ap + d[0] * astride[1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[2] + gidx * channel_size;
96
1.02G
            for (j[0] = 0; j[0] < m[0]; 
j[0]++843M
)
97
843M
            {
98
5.46G
              for (j[1] = 0; j[1] < m[1]; 
j[1]++4.62G
)
99
116G
                
for (c = 0; 4.62G
c < channel_size;
c++111G
)
100
111G
                  p += wpz[j[1] * channel_size + c] * apz[j[1] * dilation[1] * astride[2] + c];
101
843M
              wpz += w->info.dim[2] * channel_size;
102
843M
              apz += astride[1] * dilation[0];
103
843M
            }
104
179M
            bp[i[1] * bstride[2]] = p;
105
179M
          }
106
1.96M
          bp += bstride[1];
107
1.96M
          ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
108
1.96M
        }
109
41.4k
      } parallel_endfor
110
747
    } else 
if (2
size_nd == 32
) {
111
2
      if (a_nd == size_nd + 1)
112
5
        
for (i = a_nd; 1
i > 0;
i--4
)
113
4
          astride[i] = astride[i - 1];
114
2
      if (b_nd == size_nd + 1)
115
5
        
for (i = b_nd; 1
i > 0;
i--4
)
116
4
          bstride[i] = bstride[i - 1];
117
1.54k
      
parallel_for2
(idx, cmd.info.convolution.count * batch_size) {
118
1.54k
        int c;
119
1.54k
        const int bidx = idx / cmd.info.convolution.count;
120
1.54k
        const int k = idx % cmd.info.convolution.count;
121
1.54k
        const int gidx = k / group_size;
122
1.54k
        float* ap = a->data.f32 + bidx * astride[0];
123
1.54k
        float* bp = b->data.f32 + bidx * bstride[0] + k;
124
        // kernel weight for one dim.
125
1.54k
        float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3] * channel_size;
126
1.54k
        float biasval = bias ? bias->data.f32[k] : 
00
;
127
        // This block will be called in each for-loop iteration; therefore, you can use it to generate some temporary variables.
128
1.54k
        int i[3];
129
1.54k
        int n[3];
130
1.54k
        int d[3];
131
1.54k
        int m[3];
132
1.54k
        int j[3];
133
6.17k
        for (i[0] = 0; i[0] < bdim[0]; 
i[0]++4.63k
)
134
4.63k
        {
135
4.63k
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
136
4.63k
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
137
4.63k
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
138
4.63k
          d[0] = n0 * dilation[0] - n[0];
139
4.63k
          n[0] = n0;
140
4.63k
          m[0] = m[0] - n[0];
141
4.63k
          float* wpu = wp + n[0] * w->info.dim[2] * w->info.dim[3] * channel_size;
142
4.63k
          float* bpu = bp;
143
522k
          for (i[1] = 0; i[1] < bdim[1]; 
i[1]++517k
)
144
517k
          {
145
517k
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
146
517k
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
147
517k
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
148
517k
            d[1] = n1 * dilation[1] - n[1];
149
517k
            n[1] = n1;
150
517k
            m[1] = m[1] - n[1];
151
58.3M
            for (i[2] = 0; i[2] < bdim[2]; 
i[2]++57.8M
)
152
57.8M
            {
153
57.8M
              SET_BORDER_OFFSET_SIZE_FOR(2, i, hint, wdim, adim, n, m);
154
57.8M
              m[2] = (m[2] + n[2] - 1) / dilation[2] + 1;
155
57.8M
              const int n2 = (n[2] + dilation[2] - 1) / dilation[2];
156
57.8M
              d[2] = n2 * dilation[2] - n[2];
157
57.8M
              n[2] = n2;
158
57.8M
              m[2] = m[2] - n[2];
159
57.8M
              float p = biasval;
160
57.8M
              float* wpz = wpu + n[1] * w->info.dim[3] * channel_size + n[2] * channel_size;
161
57.8M
              float* apz = ap + d[0] * astride[1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[2] + (ccv_max(i[2] * hint.stride.dim[2] - hint.border.begin[2], 0) + d[2]) * astride[3] + gidx * channel_size;
162
193M
              for (j[0] = 0; j[0] < m[0]; 
j[0]++135M
)
163
135M
              {
164
1.08G
                for (j[1] = 0; j[1] < m[1]; 
j[1]++945M
)
165
7.54G
                  
for (j[2] = 0; 945M
j[2] < m[2];
j[2]++6.59G
)
166
26.3G
                    
for (c = 0; 6.59G
c < channel_size;
c++19.7G
)
167
19.7G
                      p += wpz[(j[1] * w->info.dim[3] + j[2]) * channel_size + c] * apz[j[1] * dilation[1] * astride[2] + j[2] * dilation[2] * astride[3] + c];
168
135M
                wpz += w->info.dim[2] * w->info.dim[3] * channel_size;
169
135M
                apz += astride[1] * dilation[0];
170
135M
              }
171
57.8M
              bpu[i[2] * bstride[3]] = p;
172
57.8M
            }
173
517k
            bpu += bstride[2];
174
517k
          }
175
4.63k
          bp += bstride[1];
176
4.63k
          ap += astride[1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
177
4.63k
        }
178
1.54k
      } parallel_endfor
179
2
    } else {
180
0
      assert(0 && "Cannot support 1d or 4d convolution.");
181
0
    }
182
749
  } else 
if (2
a->info.format == CCV_TENSOR_FORMAT_NCHW2
) {
183
    // Make sure the weights dimension matches the network dimension
184
7
    for (i = 0; i < size_nd; 
i++5
)
185
5
      { assert(w->info.dim[i + 2] == cmd.info.size.dim[i]); }
186
2
    int wdim[size_nd];
187
7
    for (i = 0; i < size_nd; 
i++5
)
188
5
      wdim[i] = (w->info.dim[i + 2] - 1) * dilation[i] + 1;
189
2
    assert(w->info.dim[1] * groups == adim[0]);
190
2
    assert(b->info.format == CCV_TENSOR_FORMAT_NCHW);
191
2
    const int channel_size = w->info.dim[1];
192
2
    assert(bdim[0] == cmd.info.convolution.count);
193
2
    if (size_nd == 2)
194
1
    {
195
1
      const int hw = w->info.dim[2] * w->info.dim[3];
196
1.53k
      
parallel_for1
(idx, cmd.info.convolution.count * batch_size) {
197
1.53k
        int c;
198
1.53k
        const int bidx = idx / cmd.info.convolution.count;
199
1.53k
        const int k = idx % cmd.info.convolution.count;
200
1.53k
        const int gidx = k / group_size;
201
1.53k
        float* ap = a->data.f32 + bidx * astride[0];
202
1.53k
        float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
203
        // kernel weight for one dim.
204
1.53k
        float* wp = w->data.f32 + k * hw * channel_size;
205
1.53k
        float biasval = bias ? bias->data.f32[k] : 
00
;
206
        // This block will be called in each for-loop iteration; therefore, you can use it to generate some temporary variables.
207
1.53k
        int i[2];
208
1.53k
        int n[2];
209
1.53k
        int d[2];
210
1.53k
        int m[2];
211
1.53k
        int j[2];
212
173k
        for (i[0] = 0; i[0] < bdim[1]; 
i[0]++172k
)
213
172k
        {
214
172k
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m);
215
172k
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
216
172k
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
217
172k
          d[0] = n0 * dilation[0] - n[0];
218
172k
          n[0] = n0;
219
172k
          m[0] = m[0] - n[0];
220
172k
          float* wpu = wp + n[0] * w->info.dim[3];
221
19.4M
          for (i[1] = 0; i[1] < bdim[2]; 
i[1]++19.2M
)
222
19.2M
          {
223
19.2M
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m);
224
19.2M
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
225
19.2M
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
226
19.2M
            d[1] = n1 * dilation[1] - n[1];
227
19.2M
            n[1] = n1;
228
19.2M
            m[1] = m[1] - n[1];
229
19.2M
            float p = biasval;
230
19.2M
            float* wpz = wpu + n[1];
231
19.2M
            float* apz = ap + d[0] * astride[2] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[3] + gidx * channel_size * astride[1];
232
153M
            for (j[0] = 0; j[0] < m[0]; 
j[0]++133M
)
233
133M
            {
234
1.06G
              for (j[1] = 0; j[1] < m[1]; 
j[1]++929M
)
235
3.71G
                
for (c = 0; 929M
c < channel_size;
c++2.78G
)
236
2.78G
                  p += wpz[j[1] + c * hw] * apz[j[1] * dilation[1] * astride[3] + c * astride[1]];
237
133M
              wpz += w->info.dim[3];
238
133M
              apz += astride[2] * dilation[0];
239
133M
            }
240
19.2M
            bp[i[1]] = p;
241
19.2M
          }
242
172k
          bp += bstride[2];
243
172k
          ap += astride[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
244
172k
        }
245
1.53k
      } parallel_endfor
246
1
    } else if (size_nd == 3) {
247
1
      if (a_nd == size_nd + 1)
248
0
        for (i = a_nd; i > 0; i--)
249
0
          astride[i] = astride[i - 1];
250
1
      if (b_nd == size_nd + 1)
251
0
        for (i = b_nd; i > 0; i--)
252
0
          bstride[i] = bstride[i - 1];
253
1
      const int hw = w->info.dim[2] * w->info.dim[3] * w->info.dim[4];
254
1.53k
      
parallel_for1
(idx, cmd.info.convolution.count * batch_size) {
255
1.53k
        int c;
256
1.53k
        const int bidx = idx / cmd.info.convolution.count;
257
1.53k
        const int k = idx % cmd.info.convolution.count;
258
1.53k
        const int gidx = k / group_size;
259
1.53k
        float* ap = a->data.f32 + bidx * astride[0];
260
1.53k
        float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
261
        // kernel weight for one dim.
262
1.53k
        float* wp = w->data.f32 + k * hw * channel_size;
263
1.53k
        float biasval = bias ? bias->data.f32[k] : 
00
;
264
        // This block will be called in each for-loop iteration; therefore, you can use it to generate some temporary variables.
265
1.53k
        int i[3];
266
1.53k
        int n[3];
267
1.53k
        int d[3];
268
1.53k
        int m[3];
269
1.53k
        int j[3];
270
6.14k
        for (i[0] = 0; i[0] < bdim[1]; 
i[0]++4.60k
)
271
4.60k
        {
272
4.60k
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m);
273
4.60k
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
274
4.60k
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
275
4.60k
          d[0] = n0 * dilation[0] - n[0];
276
4.60k
          n[0] = n0;
277
4.60k
          m[0] = m[0] - n[0];
278
4.60k
          float* wpu = wp + n[0] * w->info.dim[3] * w->info.dim[4];
279
520k
          for (i[1] = 0; i[1] < bdim[2]; 
i[1]++516k
)
280
516k
          {
281
516k
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m);
282
516k
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
283
516k
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
284
516k
            d[1] = n1 * dilation[1] - n[1];
285
516k
            n[1] = n1;
286
516k
            m[1] = m[1] - n[1];
287
58.3M
            for (i[2] = 0; i[2] < bdim[3]; 
i[2]++57.8M
)
288
57.8M
            {
289
57.8M
              SET_BORDER_OFFSET_SIZE_FOR(2, i, hint, wdim, adim + 1, n, m);
290
57.8M
              m[2] = (m[2] + n[2] - 1) / dilation[2] + 1;
291
57.8M
              const int n2 = (n[2] + dilation[2] - 1) / dilation[2];
292
57.8M
              d[2] = n2 * dilation[2] - n[2];
293
57.8M
              n[2] = n2;
294
57.8M
              m[2] = m[2] - n[2];
295
57.8M
              float p = biasval;
296
57.8M
              float* wpz = wpu + n[1] * w->info.dim[4] + n[2];
297
57.8M
              float* apz = ap + d[0] * astride[2] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[3] + (ccv_max(i[2] * hint.stride.dim[2] - hint.border.begin[2], 0) + d[2]) * astride[4] + gidx * channel_size * astride[1];
298
192M
              for (j[0] = 0; j[0] < m[0]; 
j[0]++134M
)
299
134M
              {
300
1.07G
                for (j[1] = 0; j[1] < m[1]; 
j[1]++936M
)
301
7.44G
                  
for (j[2] = 0; 936M
j[2] < m[2];
j[2]++6.50G
)
302
26.0G
                    
for (c = 0; 6.50G
c < channel_size;
c++19.5G
)
303
19.5G
                      p += wpz[j[1] * w->info.dim[4] + j[2] + c * hw] * apz[j[1] * dilation[1] * astride[3] + j[2] * dilation[2] * astride[4] + c * astride[1]];
304
134M
                wpz += w->info.dim[3] * w->info.dim[4];
305
134M
                apz += astride[2] * dilation[0];
306
134M
              }
307
57.8M
              bp[i[1] * bstride[3] + i[2]] = p;
308
57.8M
            }
309
516k
          }
310
4.60k
          bp += bstride[2];
311
4.60k
          ap += astride[2] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
312
4.60k
        }
313
1.53k
      } parallel_endfor
314
1
    } else {
315
0
      assert(0 && "Cannot support 1d or 4d convolution.");
316
0
    }
317
2
  }
318
751
  return CCV_NNC_EXEC_SUCCESS;
319
751
}
320
321
static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
322
1.11k
{
323
  // inputs: gradient, forw prop input, [w]
324
  // outputs: [output gradient], weight updates, bias updates
325
1.11k
  assert(input_size >= 2 && output_size >= 2);
326
1.11k
  const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
327
1.11k
  const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
328
1.11k
  ccv_nnc_tensor_t* w = output_size > 1 ? outputs[1] : 
00
;
329
1.11k
  assert(CCV_IS_TENSOR_CONTIGUOUS(w));
330
1.11k
  ccv_nnc_tensor_t* bias = output_size > 2 ? 
outputs[2]1.10k
:
02
;
331
1.11k
  assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
332
1.11k
  ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
333
1.11k
  if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
334
1.11k
  {
335
1.11k
    if (w)
336
1.11k
      memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
337
1.11k
    if (bias)
338
1.10k
      memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
339
1.11k
  }
340
1.11k
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
341
1.11k
  assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
342
1.11k
  const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? 
a->info.dim4
:
a->info.dim + 11.10k
;
343
1.11k
  const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
344
1.11k
  assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
345
1.11k
  const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? 
g->info.dim4
:
g->info.dim + 11.10k
;
346
1.11k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
347
1.11k
  ccv_nnc_tensor_view_get_stride(a, astride);
348
1.11k
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
349
1.11k
  ccv_nnc_tensor_view_get_stride(g, gstride);
350
1.11k
  const int groups = cmd.info.convolution.groups;
351
1.11k
  if (w)
352
1.11k
    assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
353
1.11k
  assert(cmd.info.convolution.count % groups == 0);
354
1.11k
  const int group_size = cmd.info.convolution.count / groups;
355
1.11k
  const int channel_size = w ? w->info.dim[CCV_NNC_MAX_DIM + 1] : 
inputs[2]->info.dim[0
CCV_NNC_MAX_DIM0
+ 1];
356
1.11k
  const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? 
a->info.dim[0]1.10k
:
14
;
357
1.11k
  const int dilation[CCV_NNC_MAX_DIM] = {
358
1.11k
    ccv_max(cmd.info.convolution.dilation[0], 1),
359
1.11k
    ccv_max(cmd.info.convolution.dilation[1], 1)
360
1.11k
  };
361
1.11k
  const int wdim[CCV_NNC_MAX_DIM] = {
362
1.11k
    (w->info.dim[1] - 1) * dilation[0] + 1,
363
1.11k
    (w->info.dim[2] - 1) * dilation[1] + 1
364
1.11k
  };
365
1.11k
  if (w)
366
1.11k
  {
367
47.4k
    
parallel_for1.11k
(k, cmd.info.convolution.count) {
368
47.4k
      int c;
369
47.4k
      const int gidx = k / group_size;
370
      // kernel weight for one dim.
371
47.4k
      float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3];
372
47.4k
      float biasval = 0;
373
47.4k
      int i[CCV_NNC_MAX_DIM];
374
47.4k
      int n[CCV_NNC_MAX_DIM];
375
47.4k
      int d[CCV_NNC_MAX_DIM];
376
47.4k
      int m[CCV_NNC_MAX_DIM];
377
47.4k
      int j[CCV_NNC_MAX_DIM];
378
47.4k
      int bidx;
379
100k
      for (bidx = 0; bidx < batch_size; 
bidx++53.1k
)
380
53.1k
      {
381
53.1k
        const float* ap = a->data.f32 + bidx * astride[0];
382
53.1k
        const float* gp = g->data.f32 + bidx * gstride[0] + k;
383
1.44M
        for (i[0] = 0; i[0] < gdim[0]; 
i[0]++1.39M
)
384
1.39M
        {
385
1.39M
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
386
1.39M
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
387
1.39M
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
388
1.39M
          d[0] = n0 * dilation[0] - n[0];
389
1.39M
          n[0] = n0;
390
1.39M
          m[0] = m[0] - n[0];
391
1.39M
          float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
392
93.5M
          for (i[1] = 0; i[1] < gdim[1]; 
i[1]++92.1M
)
393
92.1M
          {
394
92.1M
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
395
92.1M
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
396
92.1M
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
397
92.1M
            d[1] = n1 * dilation[1] - n[1];
398
92.1M
            n[1] = n1;
399
92.1M
            m[1] = m[1] - n[1];
400
92.1M
            const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
401
92.1M
            if (v == 0) // shortcut if v is zero
402
11.8M
              continue;
403
80.2M
            biasval += v;
404
80.2M
            float* wpz = wpu + n[1] * channel_size;
405
80.2M
            const float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size;
406
629M
            for (j[0] = 0; j[0] < m[0]; 
j[0]++549M
)
407
549M
            {
408
4.31G
              for (j[1] = 0; j[1] < m[1]; 
j[1]++3.76G
)
409
15.9G
                
for (c = 0; 3.76G
c < channel_size;
c++12.1G
)
410
12.1G
                  wpz[j[1] * channel_size + c] += v * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c];
411
549M
              wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
412
549M
              apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0];
413
549M
            }
414
80.2M
          }
415
1.39M
          gp += gstride[CCV_NNC_MAX_DIM - 1];
416
1.39M
          ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
417
1.39M
        }
418
53.1k
      }
419
47.4k
      if (bias)
420
47.4k
        bias->data.f32[k] = biasval;
421
47.4k
    } parallel_endfor
422
1.11k
  }
423
  // If h is available, therefore, we need to propagate the gradients back
424
1.11k
  if (h)
425
1.01k
  {
426
1.01k
    assert(h);
427
1.01k
    const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
428
1.01k
    assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
429
1.01k
    const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? 
h->info.dim6
:
h->info.dim + 11.00k
;
430
1.01k
    int hstride[CCV_NNC_MAX_DIM_ALLOC];
431
1.01k
    ccv_nnc_tensor_view_get_stride(h, hstride);
432
    // reset it to 0.
433
1.01k
    ccv_nnc_tensor_zero(h);
434
1.01k
    w = inputs[2];
435
1.01k
    assert(CCV_IS_TENSOR_CONTIGUOUS(w));
436
1.01k
    int bidx;
437
2.08k
    for (bidx = 0; bidx < batch_size; 
bidx++1.07k
)
438
1.07k
    {
439
1.07k
      int k;
440
51.0k
      for (k = 0; k < cmd.info.convolution.count; 
k++49.9k
)
441
49.9k
      {
442
49.9k
        int c;
443
49.9k
        const int gidx = k / group_size;
444
49.9k
        float* hp = h->data.f32 + bidx * hstride[0];
445
49.9k
        const float* gp = g->data.f32 + bidx * gstride[0] + k;
446
        // kernel weight for one dim.
447
49.9k
        float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
448
        // This block will be called in each for-loop iteration; therefore, you can use it to generate some temporary variables.
449
49.9k
        int i[CCV_NNC_MAX_DIM];
450
49.9k
        int n[CCV_NNC_MAX_DIM];
451
49.9k
        int d[CCV_NNC_MAX_DIM];
452
49.9k
        int m[CCV_NNC_MAX_DIM];
453
49.9k
        int j[CCV_NNC_MAX_DIM];
454
1.34M
        for (i[0] = 0; i[0] < gdim[0]; 
i[0]++1.29M
)
455
1.29M
        {
456
1.29M
          SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, hdim, n, m);
457
1.29M
          m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
458
1.29M
          const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
459
1.29M
          d[0] = n0 * dilation[0] - n[0];
460
1.29M
          n[0] = n0;
461
1.29M
          m[0] = m[0] - n[0];
462
1.29M
          const float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
463
90.3M
          for (i[1] = 0; i[1] < gdim[1]; 
i[1]++89.1M
)
464
89.1M
          {
465
89.1M
            SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, hdim, n, m);
466
89.1M
            m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
467
89.1M
            const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
468
89.1M
            d[1] = n1 * dilation[1] - n[1];
469
89.1M
            n[1] = n1;
470
89.1M
            m[1] = m[1] - n[1];
471
89.1M
            const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
472
89.1M
            if (v == 0) // shortcut if v is zero
473
9.26M
              continue;
474
79.8M
            const float* wpz = wpu + n[1] * channel_size;
475
79.8M
            float* hpz = hp + d[0] * hstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * hstride[CCV_NNC_MAX_DIM] + gidx * channel_size;
476
626M
            for (j[0] = 0; j[0] < m[0]; 
j[0]++547M
)
477
547M
            {
478
4.30G
              for (j[1] = 0; j[1] < m[1]; 
j[1]++3.75G
)
479
15.8G
                
for (c = 0; 3.75G
c < channel_size;
c++12.1G
)
480
12.1G
                  hpz[j[1] * dilation[1] * hstride[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
481
547M
              wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
482
547M
              hpz += hstride[CCV_NNC_MAX_DIM - 1] * dilation[0];
483
547M
            }
484
79.8M
          }
485
1.29M
          gp += gstride[CCV_NNC_MAX_DIM - 1];
486
1.29M
          hp += hstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
487
1.29M
        }
488
49.9k
      }
489
1.07k
    }
490
1.01k
  }
491
1.11k
  return CCV_NNC_EXEC_SUCCESS;
492
1.11k
}
493
494
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
495
1
{
496
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
497
1
  registry->tensor_datatypes = CCV_32F;
498
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
499
1
  registry->algorithms = 1;
500
1
  registry->exec = _ccv_nnc_conv_forw;
501
1
}
502
503
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
504
1
{
505
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
506
1
  registry->tensor_datatypes = CCV_32F;
507
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
508
1
  registry->algorithms = 1;
509
1
  registry->exec = _ccv_nnc_conv_back;
510
1
}
511