Coverage Report

Created: 2021-09-30 21:42

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
void _ccv_nnc_ewsum_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
	// Element-wise sum of input_size tensors into outputs[0] (float 32 reference kernel).
	// Supports in-place operation: one of the inputs may alias outputs[0].
	if (input_size == 1 && output_size == 1)
	{
		_ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
		return;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	int cinc[CCV_NNC_MAX_DIM_ALLOC];
	int x, z;
	int k = 0;
	// Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	// Accumulate pairwise: the first pass starts from inputs[k] (the aliasing input,
	// or inputs[0] if none aliases) so the output can serve as the running accumulator.
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
		ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
		assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case, just do one for-loop for sum.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				// Bug fix: was `(ainc[1] - dim[1]) * ainc[2] * ainc[0]` — a typo that
				// mis-advanced ap between i[0] slices. The stride must use ainc[3],
				// matching the bp/cp lines below and the ewprod equivalent.
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
110
111
static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Command entry point for element-wise sum: delegate to the float 32 reference kernel.
	ccv_nnc_tensor_view_t** const input_views = (ccv_nnc_tensor_view_t**)inputs;
	ccv_nnc_tensor_view_t** const output_views = (ccv_nnc_tensor_view_t**)outputs;
	_ccv_nnc_ewsum_forw_cpu_ref(input_views, input_size, output_views, output_size);
	return CCV_NNC_EXEC_SUCCESS;
}
116
117
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Gradient of a sum: D[x + y + z, x] = 1 for every addend.
	int z;
	if (inputs[0] == 0)
	{
		// No incoming gradient tensor: the gradient w.r.t. each addend is all ones.
		for (z = 0; z < output_size; z++)
		{
			if (outputs[z])
				_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[z], 1);
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Each addend's gradient is a copy of the incoming gradient; skip outputs
	// that already alias the gradient buffer (no copy needed).
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	for (z = 0; z < output_size; z++)
	{
		if (outputs[z] && inputs[0]->data.f32 != outputs[z]->data.f32)
			_ccv_nnc_tensor_transfer_cpu_ref_f32(g, (ccv_nnc_tensor_view_t*)outputs[z]);
	}
	return CCV_NNC_EXEC_SUCCESS;
}
135
136
// Element-wise product of input_size tensors into outputs[0] (float 32 reference
// kernel). Supports in-place operation: one of the inputs may alias outputs[0].
void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
	// Single input: the product degenerates to a copy.
	if (input_size == 1 && output_size == 1)
	{
		_ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
		return;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	int cinc[CCV_NNC_MAX_DIM_ALLOC];
	int x, z;
	int k = 0;
	// Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	// Accumulate pairwise, starting from the aliasing input (inputs[k]) so the
	// output tensor can serve as the running accumulator afterwards.
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
		ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
		assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case, just do one for-loop for sum.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3]
			// (last two dims are contiguous, so fuse them into one inner loop).
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
231
232
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Command entry point for element-wise product: delegate to the float 32 reference kernel.
	ccv_nnc_tensor_view_t** const input_views = (ccv_nnc_tensor_view_t**)inputs;
	ccv_nnc_tensor_view_t** const output_views = (ccv_nnc_tensor_view_t**)outputs;
	_ccv_nnc_ewprod_forw_cpu_ref(input_views, input_size, output_views, output_size);
	return CCV_NNC_EXEC_SUCCESS;
}
237
238
// Backward pass for the element-wise product. With forward output b = prod(inputs)
// and incoming gradient g, the gradient w.r.t. input a is g * b / a (since
// D[x * y * z, x] = y * z = b / x). When g is missing, it is taken as all ones.
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[x * y * z, x] = y * z
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	int ginc[CCV_NNC_MAX_DIM_ALLOC];
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	int hinc[CCV_NNC_MAX_DIM_ALLOC];
	int x, z;
	// g: incoming gradient (may be 0); b: the forward output (the full product),
	// stored at inputs[output_size + 1] per the backward command convention.
	ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
	if (g == 0)
	{
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_get_inc(b, binc);
		for (z = 0; z < output_size; z++)
		{
			// a: the z-th forward input; h: the gradient tensor to produce for it.
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
			assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
			assert(ccv_nnc_tensor_view_check_dim(a, dim));
			assert(ccv_nnc_tensor_view_check_dim(h, dim));
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case, just do one for-loop for sum.
				const int tensor_count = ccv_nnc_tensor_count(b->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Special casing if the ainc[3] is the same as dim[3]
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	} else {
		assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(b, dim);
		assert(ccv_nnc_tensor_view_check_dim(g, dim));
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(g, ginc);
		for (z = 0; z < output_size; z++)
		{
			// h = g * b / a for the z-th input a.
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
			assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
			assert(ccv_nnc_tensor_view_check_dim(a, dim));
			assert(ccv_nnc_tensor_view_check_dim(h, dim));
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case, just do one for-loop for sum.
				const int tensor_count = ccv_nnc_tensor_count(g->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* gp = g->data.f32;
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ginc[3] == dim[3] && ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Special casing if the ainc[3] is the same as dim[3]
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[2] * ginc[3];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[3];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					gp += (ginc[2] - dim[2]) * ginc[3];
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
401
402
// Element-wise division reference kernel: c = p * a / b (float 32). When a is 0,
// it is treated as an all-ones tensor, i.e. c = p / b.
static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
{
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	int cinc[CCV_NNC_MAX_DIM_ALLOC];
	if (a == 0) // Take 0 as all ones tensor.
	{
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(b, dim);
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		int x;
		if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case, just do one for-loop for sum.
			const int tensor_count = ccv_nnc_tensor_count(b->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = p / b->data.f32[x];
			return;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3]
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = p / bp[x];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = p / bp[x];
					bp += binc[3];
					cp += cinc[3];
				}
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	} else {
		assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
		assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		int x;
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case, just do one for-loop for sum.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
			return;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3]
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = p * ap[x] / bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = p * ap[x] / bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
533
534
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Command entry point for element-wise division: outputs[0] = inputs[0] / inputs[1]
	// (scale p fixed at 1).
	ccv_nnc_tensor_view_t* const numerator = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const denominator = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const quotient = (ccv_nnc_tensor_view_t*)outputs[0];
	_ccv_nnc_ewdiv_forw_cpu_ref(1, numerator, denominator, quotient);
	return CCV_NNC_EXEC_SUCCESS;
}
539
540
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
541
9
{
542
9
  // D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
543
9
  if (output_size == 1 || 
outputs[1] == 08
)
544
2
  {
545
2
    // When we only need D[x / y, x]
546
2
    _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
547
2
    return CCV_NNC_EXEC_SUCCESS;
548
2
  }
549
7
  int dim[CCV_NNC_MAX_DIM_ALLOC];
550
7
  int ginc[CCV_NNC_MAX_DIM_ALLOC];
551
7
  int binc[CCV_NNC_MAX_DIM_ALLOC];
552
7
  int cinc[CCV_NNC_MAX_DIM_ALLOC];
553
7
  int hainc[CCV_NNC_MAX_DIM_ALLOC];
554
7
  int hbinc[CCV_NNC_MAX_DIM_ALLOC];
555
7
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
556
7
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
557
7
  ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
558
7
  ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
559
7
  ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
560
7
  if (g == 0)
561
0
  {
562
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
563
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
564
0
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
565
0
    ccv_nnc_tensor_view_get_dim(b, dim);
566
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
567
0
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
568
0
    if (ha)
569
0
    {
570
0
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
571
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
572
0
    }
573
0
    int x;
574
0
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
575
0
    {
576
0
      // Super optimal case, just do one for-loop for sum.
577
0
      const int tensor_count = ccv_nnc_tensor_count(b->info);
578
0
      if (ha == 0)
579
0
      {
580
0
        for (x = 0; x < tensor_count; x++)
581
0
        {
582
0
          const float v = 1 / b->data.f32[x];
583
0
          hb->data.f32[x] = -c->data.f32[x] * v;
584
0
        }
585
0
      } else {
586
0
        for (x = 0; x < tensor_count; x++)
587
0
        {
588
0
          const float v = 1 / b->data.f32[x];
589
0
          ha->data.f32[x] = v;
590
0
          hb->data.f32[x] = -c->data.f32[x] * v;
591
0
        }
592
0
      }
593
0
      return CCV_NNC_EXEC_SUCCESS;
594
0
    }
595
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
596
0
    ccv_nnc_tensor_view_get_inc(b, binc);
597
0
    ccv_nnc_tensor_view_get_inc(c, cinc);
598
0
    ccv_nnc_tensor_view_get_inc(hb, hbinc);
599
0
    int i[CCV_NNC_MAX_DIM + 2];
600
0
    float* bp = b->data.f32;
601
0
    float* cp = c->data.f32;
602
0
    float* hbp = hb->data.f32;
603
0
    const int count = dim[2] * dim[3];
604
0
    if (ha == 0)
605
0
    {
606
0
      if (binc[3] == dim[3] && cinc[3] == dim[3] && hbinc[3] == dim[3])
607
0
      {
608
0
        // Special casing if the ainc[3] is the same as dim[3]
609
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
610
0
        {
611
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
612
0
          {
613
0
            for (x = 0; x < count; x++)
614
0
            {
615
0
              const float v = 1 / bp[x];
616
0
              hbp[x] = -cp[x] * v;
617
0
            }
618
0
            bp += binc[2] * binc[3];
619
0
            cp += cinc[2] * cinc[3];
620
0
            hbp += hbinc[2] * hbinc[3];
621
0
          }
622
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
623
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
624
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
625
0
        }
626
0
        return CCV_NNC_EXEC_SUCCESS;
627
0
      }
628
0
      // Non-optimal case, need to do skip copy.
629
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
630
0
      {
631
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
632
0
        {
633
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
634
0
          {
635
0
            for (x = 0; x < dim[3]; x++)
636
0
            {
637
0
              const float v = 1 / bp[x];
638
0
              hbp[x] = -cp[x] * v;
639
0
            }
640
0
            bp += binc[3];
641
0
            cp += cinc[3];
642
0
            hbp += hbinc[3];
643
0
          }
644
0
          bp += (binc[2] - dim[2]) * binc[3];
645
0
          cp += (cinc[2] - dim[2]) * cinc[3];
646
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
647
0
        }
648
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
649
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
650
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
651
0
      }
652
0
    } else {
653
0
      float* hap = ha->data.f32;
654
0
      ccv_nnc_tensor_view_get_inc(ha, hainc);
655
0
      if (binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
656
0
      {
657
0
        // Special casing if the ainc[3] is the same as dim[3]
658
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
659
0
        {
660
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
661
0
          {
662
0
            for (x = 0; x < count; x++)
663
0
            {
664
0
              const float v = 1 / bp[x];
665
0
              hap[x] = v;
666
0
              hbp[x] = -cp[x] * v;
667
0
            }
668
0
            bp += binc[2] * binc[3];
669
0
            cp += cinc[2] * cinc[3];
670
0
            hap += hainc[2] * hainc[3];
671
0
            hbp += hbinc[2] * hbinc[3];
672
0
          }
673
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
674
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
675
0
          hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
676
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
677
0
        }
678
0
        return CCV_NNC_EXEC_SUCCESS;
679
0
      }
680
0
      // Non-optimal case, need to do skip copy.
681
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
682
0
      {
683
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
684
0
        {
685
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
686
0
          {
687
0
            for (x = 0; x < dim[3]; x++)
688
0
            {
689
0
              const float v = 1 / bp[x];
690
0
              hap[x] = v;
691
0
              hbp[x] = -cp[x] * v;
692
0
            }
693
0
            bp += binc[3];
694
0
            cp += cinc[3];
695
0
            hap += hainc[3];
696
0
            hbp += hbinc[3];
697
0
          }
698
0
          bp += (binc[2] - dim[2]) * binc[3];
699
0
          cp += (cinc[2] - dim[2]) * cinc[3];
700
0
          hap += (hainc[2] - dim[2]) * hainc[3];
701
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
702
0
        }
703
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
704
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
705
0
        hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
706
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
707
0
      }
708
0
    }
709
7
  } else {
710
7
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
711
7
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
712
7
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
713
7
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
714
7
    ccv_nnc_tensor_view_get_dim(b, dim);
715
7
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
716
7
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
717
7
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
718
7
    if (ha)
719
0
    {
720
0
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
721
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
722
0
    }
723
7
    int x;
724
7
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || 
!0
CCV_IS_TENSOR_VIEW0
(ha)) && !CCV_IS_TENSOR_VIEW(hb))
725
7
    {
726
7
      // Super optimal case, just do one for-loop for sum.
727
7
      const int tensor_count = ccv_nnc_tensor_count(g->info);
728
7
      if (ha == 0)
729
7
      {
730
1.02k
        for (x = 0; x < tensor_count; 
x++1.02k
)
731
1.02k
        {
732
1.02k
          const float v = g->data.f32[x] / b->data.f32[x];
733
1.02k
          hb->data.f32[x] = -c->data.f32[x] * v;
734
1.02k
        }
735
7
      } else {
736
0
        for (x = 0; x < tensor_count; x++)
737
0
        {
738
0
          const float v = g->data.f32[x] / b->data.f32[x];
739
0
          ha->data.f32[x] = v;
740
0
          hb->data.f32[x] = -c->data.f32[x] * v;
741
0
        }
742
0
      }
743
7
      return CCV_NNC_EXEC_SUCCESS;
744
7
    }
745
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
746
0
    ccv_nnc_tensor_view_get_inc(g, ginc);
747
0
    ccv_nnc_tensor_view_get_inc(b, binc);
748
0
    ccv_nnc_tensor_view_get_inc(c, cinc);
749
0
    ccv_nnc_tensor_view_get_inc(hb, hbinc);
750
0
    int i[CCV_NNC_MAX_DIM + 2];
751
0
    float* gp = g->data.f32;
752
0
    float* bp = b->data.f32;
753
0
    float* cp = c->data.f32;
754
0
    float* hbp = hb->data.f32;
755
0
    const int count = dim[2] * dim[3];
756
0
    if (ha == 0)
757
0
    {
758
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hbinc[3] == dim[3])
759
0
      {
760
0
        // Special casing if the ainc[3] is the same as dim[3]
761
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
762
0
        {
763
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
764
0
          {
765
0
            for (x = 0; x < count; x++)
766
0
            {
767
0
              const float v = gp[x] / bp[x];
768
0
              hbp[x] = -cp[x] * v;
769
0
            }
770
0
            gp += ginc[2] * ginc[3];
771
0
            bp += binc[2] * binc[3];
772
0
            cp += cinc[2] * cinc[3];
773
0
            hbp += hbinc[2] * hbinc[3];
774
0
          }
775
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
776
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
777
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
778
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
779
0
        }
780
0
        return CCV_NNC_EXEC_SUCCESS;
781
0
      }
782
0
      // Non-optimal case, need to do skip copy.
783
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
784
0
      {
785
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
786
0
        {
787
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
788
0
          {
789
0
            for (x = 0; x < dim[3]; x++)
790
0
            {
791
0
              const float v = gp[x] / bp[x];
792
0
              hbp[x] = -cp[x] * v;
793
0
            }
794
0
            gp += ginc[3];
795
0
            bp += binc[3];
796
0
            cp += cinc[3];
797
0
            hbp += hbinc[3];
798
0
          }
799
0
          gp += (ginc[2] - dim[2]) * ginc[3];
800
0
          bp += (binc[2] - dim[2]) * binc[3];
801
0
          cp += (cinc[2] - dim[2]) * cinc[3];
802
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
803
0
        }
804
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
805
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
806
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
807
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
808
0
      }
809
0
    } else {
810
0
      ccv_nnc_tensor_view_get_inc(ha, hainc);
811
0
      float* hap = ha->data.f32;
812
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
813
0
      {
814
0
        // Special casing if the ainc[3] is the same as dim[3]
815
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
816
0
        {
817
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
818
0
          {
819
0
            for (x = 0; x < count; x++)
820
0
            {
821
0
              const float v = gp[x] / bp[x];
822
0
              hap[x] = v;
823
0
              hbp[x] = -cp[x] * v;
824
0
            }
825
0
            gp += ginc[2] * ginc[3];
826
0
            bp += binc[2] * binc[3];
827
0
            cp += cinc[2] * cinc[3];
828
0
            hap += hainc[2] * hainc[3];
829
0
            hbp += hbinc[2] * hbinc[3];
830
0
          }
831
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
832
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
833
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
834
0
          hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
835
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
836
0
        }
837
0
        return CCV_NNC_EXEC_SUCCESS;
838
0
      }
839
0
      // Non-optimal case, need to do skip copy.
840
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
841
0
      {
842
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
843
0
        {
844
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
845
0
          {
846
0
            for (x = 0; x < dim[3]; x++)
847
0
            {
848
0
              const float v = gp[x] / bp[x];
849
0
              hap[x] = v;
850
0
              hbp[x] = -cp[x] * v;
851
0
            }
852
0
            gp += ginc[3];
853
0
            bp += binc[3];
854
0
            cp += cinc[3];
855
0
            hap += hainc[3];
856
0
            hbp += hbinc[3];
857
0
          }
858
0
          gp += (ginc[2] - dim[2]) * ginc[3];
859
0
          bp += (binc[2] - dim[2]) * binc[3];
860
0
          cp += (cinc[2] - dim[2]) * cinc[3];
861
0
          hap += (hainc[2] - dim[2]) * hainc[3];
862
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
863
0
        }
864
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
865
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
866
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
867
0
        hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
868
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
869
0
      }
870
0
    }
871
0
  }
872
7
  
return CCV_NNC_EXEC_SUCCESS0
;
873
7
}
874
875
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Elementwise exponential forward: b[x] = exp(a[x]). Only float 32 is handled.
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Dense tensors: one flat loop covers every element.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = exp(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Tensor views: walk the (up to) 4-D shape with per-dimension increments.
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	const float* ap = a->data.f32;
	float* bp = b->data.f32;
	int i[CCV_NNC_MAX_DIM + 2];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Innermost dimension is contiguous in both views: fuse the two inner loops.
		const int count = dim[2] * dim[3];
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = exp(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// General case: advance with explicit skips after every dimension.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = exp(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
940
941
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Exp[x], x] = Exp[x], so the input gradient is g * exp(x). exp(x) is
	// already available as the forward output, passed in as inputs[2].
	if (inputs[0])
		// Multiply the incoming gradient by the forward output elementwise.
		_ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
			(ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2]
		}, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
	else
		// No incoming gradient: propagate the forward output unchanged.
		_ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
	return CCV_NNC_EXEC_SUCCESS;
}
952
953
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Elementwise natural logarithm forward: b[x] = log(a[x]). Only float 32 is handled.
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Dense tensors: one flat loop covers every element.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = log(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Tensor views: walk the (up to) 4-D shape with per-dimension increments.
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	const float* ap = a->data.f32;
	float* bp = b->data.f32;
	int i[CCV_NNC_MAX_DIM + 2];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Innermost dimension is contiguous in both views: fuse the two inner loops.
		const int count = dim[2] * dim[3];
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = log(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// General case: advance with explicit skips after every dimension.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = log(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
1018
1019
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Log[x], x] = 1 / x
	// inputs[0] is the incoming gradient g and inputs[1] the forward input x.
	// Delegate to the elementwise division kernel, which (as used by
	// _ccv_nnc_ewsqrt_back below with scale 0.5) computes
	// outputs[0] = scale * inputs[0] / inputs[1], i.e. g / x here.
	_ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
	return CCV_NNC_EXEC_SUCCESS;
}
1025
1026
static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Elementwise square root forward: b[x] = sqrt(a[x]). Only float 32 is handled.
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
	assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
	int dim[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Dense tensors: one flat loop covers every element.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = sqrt(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Tensor views: walk the (up to) 4-D shape with per-dimension increments.
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	int ainc[CCV_NNC_MAX_DIM_ALLOC];
	int binc[CCV_NNC_MAX_DIM_ALLOC];
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	const float* ap = a->data.f32;
	float* bp = b->data.f32;
	int i[CCV_NNC_MAX_DIM + 2];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Innermost dimension is contiguous in both views: fuse the two inner loops.
		const int count = dim[2] * dim[3];
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = sqrt(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// General case: advance with explicit skips after every dimension.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = sqrt(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
1091
1092
static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Sqrt[x], x] = 0.5 / Sqrt[x]
	// inputs[0] is the incoming gradient g and inputs[2] the forward output
	// sqrt(x); delegate to the scaled elementwise division kernel so that
	// outputs[0] = 0.5 * g / sqrt(x).
	_ccv_nnc_ewdiv_forw_cpu_ref(0.5, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
	return CCV_NNC_EXEC_SUCCESS;
}
1098
1099
static int _ccv_nnc_clamp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1100
6
{
1101
6
  // Assuming this is float 32.
1102
6
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1103
6
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
1104
6
  int binc[CCV_NNC_MAX_DIM_ALLOC];
1105
6
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1106
6
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1107
6
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1108
6
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1109
6
  ccv_nnc_tensor_view_get_dim(a, dim);
1110
6
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1111
6
  int x;
1112
6
  const float min = cmd.info.clamp.min;
1113
6
  const float max = cmd.info.clamp.max;
1114
6
  assert(!isnan(min) || !isnan(max));
1115
6
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1116
6
  {
1117
6
    // Super optimal case, just do one for-loop for sum.
1118
6
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1119
6
    if (isnan(min))
1120
6
    {
1121
2.00k
      for (x = 0; x < tensor_count; 
x++2.00k
)
1122
2.00k
        b->data.f32[x] = ccv_min(a->data.f32[x], max);
1123
4
    } else if (isnan(max)) {
1124
2.00k
      for (x = 0; x < tensor_count; 
x++2.00k
)
1125
2.00k
        b->data.f32[x] = ccv_max(a->data.f32[x], min);
1126
2
    } else {
1127
2.00k
      for (x = 0; x < tensor_count; 
x++2.00k
)
1128
2.00k
        b->data.f32[x] = ccv_clamp(a->data.f32[x], min, max);
1129
2
    }
1130
6
    return CCV_NNC_EXEC_SUCCESS;
1131
6
  }
1132
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1133
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
1134
0
  ccv_nnc_tensor_view_get_inc(b, binc);
1135
0
  int i[CCV_NNC_MAX_DIM + 2];
1136
0
  float* ap = a->data.f32;
1137
0
  float* bp = b->data.f32;
1138
0
  const int count = dim[2] * dim[3];
1139
0
  if (isnan(min))
1140
0
  {
1141
0
    if (ainc[3] == dim[3] && binc[3] == dim[3])
1142
0
    {
1143
0
      // Special casing if the ainc[3] is the same as dim[3]
1144
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1145
0
      {
1146
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1147
0
        {
1148
0
          for (x = 0; x < count; x++)
1149
0
            bp[x] = ccv_min(ap[x], max);
1150
0
          ap += ainc[2] * ainc[3];
1151
0
          bp += binc[2] * binc[3];
1152
0
        }
1153
0
        ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1154
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1155
0
      }
1156
0
      return CCV_NNC_EXEC_SUCCESS;
1157
0
    }
1158
0
    // Non-optimal case, need to do skip copy.
1159
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1160
0
    {
1161
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1162
0
      {
1163
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1164
0
        {
1165
0
          for (x = 0; x < dim[3]; x++)
1166
0
            bp[x] = ccv_min(ap[x], max);
1167
0
          ap += ainc[3];
1168
0
          bp += binc[3];
1169
0
        }
1170
0
        ap += (ainc[2] - dim[2]) * ainc[3];
1171
0
        bp += (binc[2] - dim[2]) * binc[3];
1172
0
      }
1173
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1174
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1175
0
    }
1176
0
  } else if (isnan(max)) {
1177
0
    if (ainc[3] == dim[3] && binc[3] == dim[3])
1178
0
    {
1179
0
      // Special casing if the ainc[3] is the same as dim[3]
1180
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1181
0
      {
1182
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1183
0
        {
1184
0
          for (x = 0; x < count; x++)
1185
0
            bp[x] = ccv_max(ap[x], min);
1186
0
          ap += ainc[2] * ainc[3];
1187
0
          bp += binc[2] * binc[3];
1188
0
        }
1189
0
        ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1190
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1191
0
      }
1192
0
      return CCV_NNC_EXEC_SUCCESS;
1193
0
    }
1194
0
    // Non-optimal case, need to do skip copy.
1195
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1196
0
    {
1197
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1198
0
      {
1199
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1200
0
        {
1201
0
          for (x = 0; x < dim[3]; x++)
1202
0
            bp[x] = ccv_max(ap[x], min);
1203
0
          ap += ainc[3];
1204
0
          bp += binc[3];
1205
0
        }
1206
0
        ap += (ainc[2] - dim[2]) * ainc[3];
1207
0
        bp += (binc[2] - dim[2]) * binc[3];
1208
0
      }
1209
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1210
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1211
0
    }
1212
0
  } else {
1213
0
    if (ainc[3] == dim[3] && binc[3] == dim[3])
1214
0
    {
1215
0
      // Special casing if the ainc[3] is the same as dim[3]
1216
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1217
0
      {
1218
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1219
0
        {
1220
0
          for (x = 0; x < count; x++)
1221
0
            bp[x] = ccv_clamp(ap[x], min, max);
1222
0
          ap += ainc[2] * ainc[3];
1223
0
          bp += binc[2] * binc[3];
1224
0
        }
1225
0
        ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1226
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1227
0
      }
1228
0
      return CCV_NNC_EXEC_SUCCESS;
1229
0
    }
1230
0
    // Non-optimal case, need to do skip copy.
1231
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1232
0
    {
1233
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1234
0
      {
1235
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1236
0
        {
1237
0
          for (x = 0; x < dim[3]; x++)
1238
0
            bp[x] = ccv_clamp(ap[x], min, max);
1239
0
          ap += ainc[3];
1240
0
          bp += binc[3];
1241
0
        }
1242
0
        ap += (ainc[2] - dim[2]) * ainc[3];
1243
0
        bp += (binc[2] - dim[2]) * binc[3];
1244
0
      }
1245
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
1246
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1247
0
    }
1248
0
  }
1249
0
  return CCV_NNC_EXEC_SUCCESS;
1250
0
}
1251
1252
static int _ccv_nnc_clamp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1253
3
{
1254
3
  assert(input_size == 3);
1255
3
  const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradient
1256
3
  const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
1257
3
  assert(output_size == 1);
1258
3
  ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
1259
3
  // Assuming this is float 32.
1260
3
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1261
3
  int hinc[CCV_NNC_MAX_DIM_ALLOC];
1262
3
  int binc[CCV_NNC_MAX_DIM_ALLOC];
1263
3
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
1264
3
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1265
3
  ccv_nnc_tensor_view_get_dim(g, dim);
1266
3
  ccv_nnc_tensor_view_get_dim(h, dim);
1267
3
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1268
3
  int x;
1269
3
  const float min = cmd.info.clamp.min;
1270
3
  const float max = cmd.info.clamp.max;
1271
3
  assert(!isnan(min) || !isnan(max));
1272
3
  if (g)
1273
3
  {
1274
3
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1275
3
    {
1276
3
      // Super optimal case, just do one for-loop for sum.
1277
3
      const int tensor_count = ccv_nnc_tensor_count(g->info);
1278
3
      if (isnan(min))
1279
3
      {
1280
1.00k
        for (x = 0; x < tensor_count; 
x++1.00k
)
1281
1.00k
          h->data.f32[x] = b->data.f32[x] >= max ? 
0509
:
g->data.f32[x]491
;
1282
2
      } else if (isnan(max)) {
1283
1.00k
        for (x = 0; x < tensor_count; 
x++1.00k
)
1284
1.00k
          h->data.f32[x] = b->data.f32[x] <= min ? 
00
: g->data.f32[x];
1285
1
      } else {
1286
1.00k
        for (x = 0; x < tensor_count; 
x++1.00k
)
1287
1.00k
          h->data.f32[x] = (b->data.f32[x] >= max || 
b->data.f32[x] <= min491
) ?
0509
:
g->data.f32[x]491
;
1288
1
      }
1289
3
      return CCV_NNC_EXEC_SUCCESS;
1290
3
    }
1291
0
    int ginc[CCV_NNC_MAX_DIM_ALLOC];
1292
0
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
1293
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1294
0
    ccv_nnc_tensor_view_get_inc(g, ginc);
1295
0
    ccv_nnc_tensor_view_get_inc(b, binc);
1296
0
    ccv_nnc_tensor_view_get_inc(h, hinc);
1297
0
    int i[CCV_NNC_MAX_DIM + 2];
1298
0
    float* gp = g->data.f32;
1299
0
    float* bp = b->data.f32;
1300
0
    float* hp = h->data.f32;
1301
0
    const int count = dim[2] * dim[3];
1302
0
    const float min = cmd.info.clamp.min;
1303
0
    const float max = cmd.info.clamp.max;
1304
0
    assert(!isnan(min) || !isnan(max));
1305
0
    if (isnan(min))
1306
0
    {
1307
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
1308
0
      {
1309
0
        // Special casing if the ginc[3] is the same as dim[3]
1310
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1311
0
        {
1312
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1313
0
          {
1314
0
            for (x = 0; x < count; x++)
1315
0
              hp[x] = bp[x] >= max ? 0 : gp[x];
1316
0
            gp += ginc[2] * ginc[3];
1317
0
            bp += binc[2] * binc[3];
1318
0
            hp += hinc[2] * hinc[3];
1319
0
          }
1320
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1321
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1322
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1323
0
        }
1324
0
        return CCV_NNC_EXEC_SUCCESS;
1325
0
      }
1326
0
      // Non-optimal case, need to do skip copy.
1327
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1328
0
      {
1329
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1330
0
        {
1331
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1332
0
          {
1333
0
            for (x = 0; x < dim[3]; x++)
1334
0
              hp[x] = bp[x] >= max ? 0 : gp[x];
1335
0
            gp += ginc[3];
1336
0
            bp += binc[3];
1337
0
            hp += hinc[3];
1338
0
          }
1339
0
          gp += (ginc[2] - dim[2]) * ginc[3];
1340
0
          bp += (binc[2] - dim[2]) * binc[3];
1341
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1342
0
        }
1343
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1344
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1345
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1346
0
      }
1347
0
    } else if (isnan(max)) {
1348
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
1349
0
      {
1350
0
        // Special casing if the ginc[3] is the same as dim[3]
1351
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1352
0
        {
1353
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1354
0
          {
1355
0
            for (x = 0; x < count; x++)
1356
0
              hp[x] = bp[x] <= min ? 0 : gp[x];
1357
0
            gp += ginc[2] * ginc[3];
1358
0
            bp += binc[2] * binc[3];
1359
0
            hp += hinc[2] * hinc[3];
1360
0
          }
1361
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1362
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1363
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1364
0
        }
1365
0
        return CCV_NNC_EXEC_SUCCESS;
1366
0
      }
1367
0
      // Non-optimal case, need to do skip copy.
1368
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1369
0
      {
1370
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1371
0
        {
1372
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1373
0
          {
1374
0
            for (x = 0; x < dim[3]; x++)
1375
0
              hp[x] = bp[x] <= min ? 0 : gp[x];
1376
0
            gp += ginc[3];
1377
0
            bp += binc[3];
1378
0
            hp += hinc[3];
1379
0
          }
1380
0
          gp += (ginc[2] - dim[2]) * ginc[3];
1381
0
          bp += (binc[2] - dim[2]) * binc[3];
1382
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1383
0
        }
1384
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1385
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1386
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1387
0
      }
1388
0
    } else {
1389
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
1390
0
      {
1391
0
        // Special casing if the ginc[3] is the same as dim[3]
1392
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1393
0
        {
1394
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1395
0
          {
1396
0
            for (x = 0; x < count; x++)
1397
0
              hp[x] = (bp[x] >= max || bp[x] <= min) ? 0 : gp[x];
1398
0
            gp += ginc[2] * ginc[3];
1399
0
            bp += binc[2] * binc[3];
1400
0
            hp += hinc[2] * hinc[3];
1401
0
          }
1402
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1403
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1404
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1405
0
        }
1406
0
        return CCV_NNC_EXEC_SUCCESS;
1407
0
      }
1408
0
      // Non-optimal case, need to do skip copy.
1409
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1410
0
      {
1411
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1412
0
        {
1413
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1414
0
          {
1415
0
            for (x = 0; x < dim[3]; x++)
1416
0
              hp[x] = (bp[x] >= max || bp[x] <= min) ? 0 : gp[x];
1417
0
            gp += ginc[3];
1418
0
            bp += binc[3];
1419
0
            hp += hinc[3];
1420
0
          }
1421
0
          gp += (ginc[2] - dim[2]) * ginc[3];
1422
0
          bp += (binc[2] - dim[2]) * binc[3];
1423
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1424
0
        }
1425
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
1426
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1427
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1428
0
      }
1429
0
    }
1430
0
  } else {
1431
0
    if (!CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1432
0
    {
1433
0
      // Super optimal case, just do one for-loop for sum.
1434
0
      const int tensor_count = ccv_nnc_tensor_count(h->info);
1435
0
      if (isnan(min))
1436
0
      {
1437
0
        for (x = 0; x < tensor_count; x++)
1438
0
          h->data.f32[x] = b->data.f32[x] >= max ? 0 : 1;
1439
0
      } else if (isnan(max)) {
1440
0
        for (x = 0; x < tensor_count; x++)
1441
0
          h->data.f32[x] = b->data.f32[x] <= min ? 0 : 1;
1442
0
      } else {
1443
0
        for (x = 0; x < tensor_count; x++)
1444
0
          h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : 1;
1445
0
      }
1446
0
      return CCV_NNC_EXEC_SUCCESS;
1447
0
    }
1448
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1449
0
    ccv_nnc_tensor_view_get_inc(b, binc);
1450
0
    ccv_nnc_tensor_view_get_inc(h, hinc);
1451
0
    int i[CCV_NNC_MAX_DIM + 2];
1452
0
    float* bp = b->data.f32;
1453
0
    float* hp = h->data.f32;
1454
0
    const int count = dim[2] * dim[3];
1455
0
    const float min = cmd.info.clamp.min;
1456
0
    const float max = cmd.info.clamp.max;
1457
0
    assert(!isnan(min) || !isnan(max));
1458
0
    if (isnan(min))
1459
0
    {
1460
0
      if (binc[3] == dim[3] && hinc[3] == dim[3])
1461
0
      {
1462
0
        // Special casing if the binc[3] is the same as dim[3]
1463
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1464
0
        {
1465
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1466
0
          {
1467
0
            for (x = 0; x < count; x++)
1468
0
              hp[x] = bp[x] >= max ? 0 : 1;
1469
0
            bp += binc[2] * binc[3];
1470
0
            hp += hinc[2] * hinc[3];
1471
0
          }
1472
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1473
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1474
0
        }
1475
0
        return CCV_NNC_EXEC_SUCCESS;
1476
0
      }
1477
0
      // Non-optimal case, need to do skip copy.
1478
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1479
0
      {
1480
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1481
0
        {
1482
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1483
0
          {
1484
0
            for (x = 0; x < dim[3]; x++)
1485
0
              hp[x] = bp[x] >= max ? 0 : 1;
1486
0
            bp += binc[3];
1487
0
            hp += hinc[3];
1488
0
          }
1489
0
          bp += (binc[2] - dim[2]) * binc[3];
1490
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1491
0
        }
1492
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1493
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1494
0
      }
1495
0
    } else if (isnan(max)) {
1496
0
      if (binc[3] == dim[3] && hinc[3] == dim[3])
1497
0
      {
1498
0
        // Special casing if the binc[3] is the same as dim[3]
1499
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1500
0
        {
1501
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1502
0
          {
1503
0
            for (x = 0; x < count; x++)
1504
0
              hp[x] = bp[x] <= min ? 0 : 1;
1505
0
            bp += binc[2] * binc[3];
1506
0
            hp += hinc[2] * hinc[3];
1507
0
          }
1508
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1509
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1510
0
        }
1511
0
        return CCV_NNC_EXEC_SUCCESS;
1512
0
      }
1513
0
      // Non-optimal case, need to do skip copy.
1514
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1515
0
      {
1516
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1517
0
        {
1518
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1519
0
          {
1520
0
            for (x = 0; x < dim[3]; x++)
1521
0
              hp[x] = bp[x] <= min ? 0 : 1;
1522
0
            bp += binc[3];
1523
0
            hp += hinc[3];
1524
0
          }
1525
0
          bp += (binc[2] - dim[2]) * binc[3];
1526
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1527
0
        }
1528
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1529
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1530
0
      }
1531
0
    } else {
1532
0
      if (binc[3] == dim[3] && hinc[3] == dim[3])
1533
0
      {
1534
0
        // Special casing if the binc[3] is the same as dim[3]
1535
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1536
0
        {
1537
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1538
0
          {
1539
0
            for (x = 0; x < count; x++)
1540
0
              hp[x] = (bp[x] >= max || bp[x] <= min) ? 0 : 1;
1541
0
            bp += binc[2] * binc[3];
1542
0
            hp += hinc[2] * hinc[3];
1543
0
          }
1544
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1545
0
          hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1546
0
        }
1547
0
        return CCV_NNC_EXEC_SUCCESS;
1548
0
      }
1549
0
      // Non-optimal case, need to do skip copy.
1550
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1551
0
      {
1552
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1553
0
        {
1554
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1555
0
          {
1556
0
            for (x = 0; x < dim[3]; x++)
1557
0
              hp[x] = (bp[x] >= max || bp[x] <= min) ? 0 : 1;
1558
0
            bp += binc[3];
1559
0
            hp += hinc[3];
1560
0
          }
1561
0
          bp += (binc[2] - dim[2]) * binc[3];
1562
0
          hp += (hinc[2] - dim[2]) * hinc[3];
1563
0
        }
1564
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
1565
0
        hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
1566
0
      }
1567
0
    }
1568
0
  }
1569
3
  
return CCV_NNC_EXEC_SUCCESS0
;
1570
3
}
1571
1572
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise sum, forward pass.
  registry->exec = _ccv_nnc_ewsum_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1580
1581
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise sum, backward pass.
  registry->exec = _ccv_nnc_ewsum_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1589
1590
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise product, forward pass.
  registry->exec = _ccv_nnc_ewprod_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1598
1599
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise product, backward pass.
  registry->exec = _ccv_nnc_ewprod_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1607
1608
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise division, forward pass.
  registry->exec = _ccv_nnc_ewdiv_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1616
1617
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise division, backward pass.
  registry->exec = _ccv_nnc_ewdiv_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1625
1626
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise exponential, forward pass.
  registry->exec = _ccv_nnc_ewexp_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1634
1635
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise exponential, backward pass.
  registry->exec = _ccv_nnc_ewexp_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1643
1644
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise logarithm, forward pass.
  registry->exec = _ccv_nnc_ewlog_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1652
1653
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise logarithm, backward pass.
  registry->exec = _ccv_nnc_ewlog_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1661
1662
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise square root, forward pass.
  registry->exec = _ccv_nnc_ewsqrt_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1670
1671
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of the element-wise square root, backward pass.
  registry->exec = _ccv_nnc_ewsqrt_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1679
1680
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of clamp (min/max clipping), forward pass.
  registry->exec = _ccv_nnc_clamp_forw;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1688
1689
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Reference CPU implementation of clamp (min/max clipping), backward pass.
  registry->exec = _ccv_nnc_clamp_back;
  registry->algorithms = 1; // Only one algorithm variant is provided.
  // Runs on 32-bit float tensors in CPU memory.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  // Element-wise: any tensor layout is acceptable.
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}