Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/convolution/ccv_nnc_conv_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
// Reference CPU implementation of 2D convolution forward pass. Supports NHWC and
// NCHW layouts, grouped convolution and dilation. inputs: [a (input tensor view),
// w (weights, contiguous), bias (optional)]; outputs: [b (output tensor view)].
// Returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_conv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	assert(input_size >= 2);
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	const ccv_nnc_tensor_t* w = inputs[1];
	assert(CCV_IS_TENSOR_CONTIGUOUS(w));
	// Bias is optional: only present when input_size > 2.
	const ccv_nnc_tensor_t* bias = input_size > 2 ? inputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	assert(output_size == 1);
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	// adim skips the batch dimension when a is batched (CCV_NNC_MAX_DIM + 2 dims).
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
	assert(b_nd == CCV_NNC_MAX_DIM + 1 || b_nd == CCV_NNC_MAX_DIM + 2);
	const int* bdim = (b_nd == CCV_NNC_MAX_DIM + 1) ? b->info.dim : b->info.dim + 1;
	const int groups = cmd.info.convolution.groups;
	assert(cmd.info.convolution.count % groups == 0);
	// Number of output channels (kernels) handled per group.
	const int group_size = cmd.info.convolution.count / groups;
	// Make sure the weights output dimension matches the network convolution kernels
	assert(w->info.dim[0] == cmd.info.convolution.count);
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	int bstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(b, bstride);
	assert(!bias || bias->info.dim[0] == cmd.info.convolution.count);
	const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
	// Dilation defaults to 1 when unset (0).
	const int dilation[CCV_NNC_MAX_DIM] = {
		ccv_max(cmd.info.convolution.dilation[0], 1),
		ccv_max(cmd.info.convolution.dilation[1], 1)
	};
	if (a->info.format == CCV_TENSOR_FORMAT_NHWC)
	{
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[1] == cmd.info.size.dim[0]);
		assert(w->info.dim[2] == cmd.info.size.dim[1]);
		// Effective (dilated) kernel footprint on the input plane.
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[1] - 1) * dilation[0] + 1,
			(w->info.dim[2] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NHWC);
		// Input channels each kernel reads (per group).
		const int channel_size = w->info.dim[CCV_NNC_MAX_DIM + 1];
		assert(bdim[CCV_NNC_MAX_DIM] == cmd.info.convolution.count);
		// Parallelize over (batch index, output channel) pairs.
		parallel_for(idx, cmd.info.convolution.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution.count;
			const int k = idx % cmd.info.convolution.count;
			const int gidx = k / group_size;
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k;
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
			float biasval = bias ? bias->data.f32[k] : 0;
			// This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			for (i[0] = 0; i[0] < bdim[0]; i[0]++)
			{
				// n/m hold the in-bounds offset and extent of the (dilated) kernel
				// window at this output row; d is the sub-dilation remainder.
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0];
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
				for (i[1] = 0; i[1] < bdim[1]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float p = biasval;
					float* wpz = wpu + n[1] * channel_size;
					// gidx * channel_size selects this group's input-channel slab.
					float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size;
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
							for (c = 0; c < channel_size; c++)
								p += wpz[j[1] * channel_size + c] * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c];
						wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
						apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0];
					}
					bp[i[1] * bstride[CCV_NNC_MAX_DIM]] = p;
				}
				bp += bstride[CCV_NNC_MAX_DIM - 1];
				// Advance ap by how much the clamped window origin actually moved.
				ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	} else if (a->info.format == CCV_TENSOR_FORMAT_NCHW) {
		// Make sure the weights dimension matches the network dimension
		assert(w->info.dim[2] == cmd.info.size.dim[0]);
		assert(w->info.dim[3] == cmd.info.size.dim[1]);
		// Effective (dilated) kernel footprint on the input plane.
		const int wdim[CCV_NNC_MAX_DIM] = {
			(w->info.dim[2] - 1) * dilation[0] + 1,
			(w->info.dim[3] - 1) * dilation[1] + 1
		};
		assert(w->info.dim[1] * groups == adim[0]);
		assert(b->info.format == CCV_TENSOR_FORMAT_NCHW);
		const int channel_size = w->info.dim[1];
		// Spatial footprint of one kernel channel plane.
		const int hw = w->info.dim[2] * w->info.dim[3];
		assert(bdim[0] == cmd.info.convolution.count);
		parallel_for(idx, cmd.info.convolution.count * batch_size) {
			int c;
			const int bidx = idx / cmd.info.convolution.count;
			const int k = idx % cmd.info.convolution.count;
			const int gidx = k / group_size;
			float* ap = a->data.f32 + bidx * astride[0];
			float* bp = b->data.f32 + bidx * bstride[0] + k * bstride[1];
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * hw * channel_size;
			float biasval = bias ? bias->data.f32[k] : 0;
			// This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			// NCHW: spatial dims start at bdim[1] (bdim[0] is the channel count).
			for (i[0] = 0; i[0] < bdim[1]; i[0]++)
			{
				SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim + 1, n, m);
				m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
				const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
				d[0] = n0 * dilation[0] - n[0];
				n[0] = n0;
				m[0] = m[0] - n[0];
				float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM + 1];
				for (i[1] = 0; i[1] < bdim[2]; i[1]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim + 1, n, m);
					m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
					const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
					d[1] = n1 * dilation[1] - n[1];
					n[1] = n1;
					m[1] = m[1] - n[1];
					float p = biasval;
					float* wpz = wpu + n[1];
					float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM + 1] + gidx * channel_size * astride[1];
					for (j[0] = 0; j[0] < m[0]; j[0]++)
					{
						for (j[1] = 0; j[1] < m[1]; j[1]++)
							// Weights are channel-planar in NCHW: step c * hw per channel.
							for (c = 0; c < channel_size; c++)
								p += wpz[j[1] + c * hw] * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM + 1] + c * astride[1]];
						wpz += w->info.dim[CCV_NNC_MAX_DIM + 1];
						apz += astride[CCV_NNC_MAX_DIM] * dilation[0];
					}
					bp[i[1]] = p;
				}
				bp += bstride[CCV_NNC_MAX_DIM];
				// Advance ap by how much the clamped window origin actually moved.
				ap += astride[CCV_NNC_MAX_DIM] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
			}
		} parallel_endfor
	}
	return CCV_NNC_EXEC_SUCCESS;
}
173
174
// Reference CPU implementation of 2D convolution backward pass (NHWC only, per
// the backend registration). Computes weight/bias gradients into outputs[1]/[2]
// (when requested) and the input gradient into outputs[0] (when requested).
// inputs: [g (output gradient), a (forward input), w (weights, needed when
// outputs[0] is requested)]. Returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_conv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// inputs: gradient, forw prop input, [w]
	// outputs: [output gradient], weight updates, bias updates
	assert(input_size >= 2 && output_size >= 2);
	const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradients
	const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
	// w may legitimately be 0 when the caller only wants the input gradient h.
	ccv_nnc_tensor_t* w = output_size > 1 ? outputs[1] : 0;
	assert(!w || CCV_IS_TENSOR_CONTIGUOUS(w));
	ccv_nnc_tensor_t* bias = output_size > 2 ? outputs[2] : 0;
	assert(!bias || !CCV_IS_TENSOR_VIEW(bias));
	ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0]; // output gradients
	if (!(flags & CCV_NNC_ACCUMULATE_OUTPUT)) // reset the gradients to 0
	{
		if (w)
			memset(w->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(w->info));
		if (bias)
			memset(bias->data.u8, 0, sizeof(float) * ccv_nnc_tensor_count(bias->info));
	}
	const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
	assert(a_nd == CCV_NNC_MAX_DIM + 1 || a_nd == CCV_NNC_MAX_DIM + 2);
	// adim / gdim skip the batch dimension when the tensor is batched.
	const int* adim = (a_nd == CCV_NNC_MAX_DIM + 1) ? a->info.dim : a->info.dim + 1;
	const int g_nd = ccv_nnc_tensor_nd(g->info.dim);
	assert(g_nd == CCV_NNC_MAX_DIM + 1 || g_nd == CCV_NNC_MAX_DIM + 2);
	const int* gdim = (g_nd == CCV_NNC_MAX_DIM + 1) ? g->info.dim : g->info.dim + 1;
	int astride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(a, astride);
	int gstride[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_stride(g, gstride);
	const int groups = cmd.info.convolution.groups;
	if (w)
		assert(w->info.dim[CCV_NNC_MAX_DIM + 1] * groups == adim[CCV_NNC_MAX_DIM]);
	assert(cmd.info.convolution.count % groups == 0);
	const int group_size = cmd.info.convolution.count / groups;
	// All shape queries go through a non-NULL weight descriptor: the weight
	// gradient output when requested, otherwise the forward weights inputs[2].
	// (Previously wdim dereferenced w unconditionally — a NULL-pointer
	// dereference when only the input gradient h was requested.)
	const ccv_nnc_tensor_t* const wt = w ? (const ccv_nnc_tensor_t*)w : inputs[2];
	const int channel_size = wt->info.dim[CCV_NNC_MAX_DIM + 1];
	const int batch_size = (a_nd == CCV_NNC_MAX_DIM + 2) ? a->info.dim[0] : 1;
	// Dilation defaults to 1 when unset (0).
	const int dilation[CCV_NNC_MAX_DIM] = {
		ccv_max(cmd.info.convolution.dilation[0], 1),
		ccv_max(cmd.info.convolution.dilation[1], 1)
	};
	// Effective (dilated) kernel footprint on the input plane.
	const int wdim[CCV_NNC_MAX_DIM] = {
		(wt->info.dim[1] - 1) * dilation[0] + 1,
		(wt->info.dim[2] - 1) * dilation[1] + 1
	};
	if (w)
	{
		// Weight (and bias) gradient accumulation, parallel over output channels.
		parallel_for(k, cmd.info.convolution.count) {
			int c;
			const int gidx = k / group_size;
			// kernel weight for one dim.
			float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * w->info.dim[3];
			float biasval = 0;
			int i[CCV_NNC_MAX_DIM];
			int n[CCV_NNC_MAX_DIM];
			int d[CCV_NNC_MAX_DIM];
			int m[CCV_NNC_MAX_DIM];
			int j[CCV_NNC_MAX_DIM];
			int bidx;
			for (bidx = 0; bidx < batch_size; bidx++)
			{
				const float* ap = a->data.f32 + bidx * astride[0];
				const float* gp = g->data.f32 + bidx * gstride[0] + k;
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					// n/m: in-bounds offset and extent of the dilated window at this
					// output row; d is the sub-dilation remainder.
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, adim, n, m);
					m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
					const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
					d[0] = n0 * dilation[0] - n[0];
					n[0] = n0;
					m[0] = m[0] - n[0];
					float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, adim, n, m);
						m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
						const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
						d[1] = n1 * dilation[1] - n[1];
						n[1] = n1;
						m[1] = m[1] - n[1];
						const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						biasval += v;
						float* wpz = wpu + n[1] * channel_size;
						const float* apz = ap + d[0] * astride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * astride[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									wpz[j[1] * channel_size + c] += v * apz[j[1] * dilation[1] * astride[CCV_NNC_MAX_DIM] + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							apz += astride[CCV_NNC_MAX_DIM - 1] * dilation[0];
						}
					}
					gp += gstride[CCV_NNC_MAX_DIM - 1];
					// Advance ap by how much the clamped window origin actually moved.
					ap += astride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
			if (bias)
				bias->data.f32[k] = biasval;
		} parallel_endfor
	}
	// If h is available, therefore, we need to propagate the gradients back
	if (h)
	{
		assert(h);
		const int h_nd = ccv_nnc_tensor_nd(h->info.dim);
		assert(h_nd == CCV_NNC_MAX_DIM + 1 || h_nd == CCV_NNC_MAX_DIM + 2);
		const int* hdim = (h_nd == CCV_NNC_MAX_DIM + 1) ? h->info.dim : h->info.dim + 1;
		int hstride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_view_get_stride(h, hstride);
		// reset it to 0.
		ccv_nnc_tensor_zero(h);
		// Input-gradient propagation reads the forward weights, not the gradient.
		w = inputs[2];
		assert(CCV_IS_TENSOR_CONTIGUOUS(w));
		int bidx;
		for (bidx = 0; bidx < batch_size; bidx++)
		{
			int k;
			for (k = 0; k < cmd.info.convolution.count; k++)
			{
				int c;
				const int gidx = k / group_size;
				float* hp = h->data.f32 + bidx * hstride[0];
				const float* gp = g->data.f32 + bidx * gstride[0] + k;
				// kernel weight for one dim.
				float* wp = w->data.f32 + k * w->info.dim[1] * w->info.dim[2] * channel_size;
				// This block will be cause in each for-loop, therefore, you can use it to generate some temporary variables.
				int i[CCV_NNC_MAX_DIM];
				int n[CCV_NNC_MAX_DIM];
				int d[CCV_NNC_MAX_DIM];
				int m[CCV_NNC_MAX_DIM];
				int j[CCV_NNC_MAX_DIM];
				for (i[0] = 0; i[0] < gdim[0]; i[0]++)
				{
					SET_BORDER_OFFSET_SIZE_FOR(0, i, hint, wdim, hdim, n, m);
					m[0] = (m[0] + n[0] - 1) / dilation[0] + 1;
					const int n0 = (n[0] + dilation[0] - 1) / dilation[0];
					d[0] = n0 * dilation[0] - n[0];
					n[0] = n0;
					m[0] = m[0] - n[0];
					const float* wpu = wp + n[0] * w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
					for (i[1] = 0; i[1] < gdim[1]; i[1]++)
					{
						SET_BORDER_OFFSET_SIZE_FOR(1, i, hint, wdim, hdim, n, m);
						m[1] = (m[1] + n[1] - 1) / dilation[1] + 1;
						const int n1 = (n[1] + dilation[1] - 1) / dilation[1];
						d[1] = n1 * dilation[1] - n[1];
						n[1] = n1;
						m[1] = m[1] - n[1];
						const float v = gp[i[1] * gstride[CCV_NNC_MAX_DIM]];
						if (v == 0) // shortcut if v is zero
							continue;
						const float* wpz = wpu + n[1] * channel_size;
						float* hpz = hp + d[0] * hstride[CCV_NNC_MAX_DIM - 1] + (ccv_max(i[1] * hint.stride.dim[1] - hint.border.begin[1], 0) + d[1]) * hstride[CCV_NNC_MAX_DIM] + gidx * channel_size;
						for (j[0] = 0; j[0] < m[0]; j[0]++)
						{
							for (j[1] = 0; j[1] < m[1]; j[1]++)
								for (c = 0; c < channel_size; c++)
									hpz[j[1] * dilation[1] * hstride[CCV_NNC_MAX_DIM] + c] += v * wpz[j[1] * channel_size + c];
							wpz += w->info.dim[CCV_NNC_MAX_DIM] * channel_size;
							hpz += hstride[CCV_NNC_MAX_DIM - 1] * dilation[0];
						}
					}
					gp += gstride[CCV_NNC_MAX_DIM - 1];
					hp += hstride[CCV_NNC_MAX_DIM - 1] * (ccv_max((i[0] + 1) * hint.stride.dim[0] - hint.border.begin[0], 0) - ccv_max(i[0] * hint.stride.dim[0] - hint.border.begin[0], 0));
				}
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
346
347
// Register the reference CPU backend for convolution forward: 32-bit float,
// CPU memory, and both NHWC and NCHW tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_forw;
	registry->algorithms = 1; // a single (naive) algorithm
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW;
}
355
356
// Register the reference CPU backend for convolution backward: 32-bit float,
// CPU memory. Note only NHWC is supported on the backward path.
REGISTER_COMMAND_BACKEND(CCV_NNC_CONVOLUTION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_conv_back;
	registry->algorithms = 1; // a single (naive) algorithm
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC;
}
364