Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
void _ccv_nnc_ewsum_forw_cpu_ref_f32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
16
35.9k
{
17
35.9k
  if (input_size == 1 && output_size == 1)
18
0
  {
19
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
20
0
    return;
21
0
  }
22
  // Assuming this is float 32.
23
35.9k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
24
35.9k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
25
35.9k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
26
35.9k
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
27
35.9k
  int x, z;
28
35.9k
  int k = 0;
29
  // Bad, I promised this can be an in-place operation. Need to first find out whether any of the inputs shares the same pointer as the output.
30
71.9k
  for (z = 1; z < input_size; z++)
31
35.9k
  {
32
35.9k
    ccv_nnc_tensor_view_t* c = outputs[0];
33
35.9k
    ccv_nnc_tensor_view_t* a = inputs[z];
34
35.9k
    if (c->data.f32 == a->data.f32)
35
10
    {
36
10
      k = z;
37
10
      break;
38
10
    }
39
35.9k
  }
40
71.9k
  for (z = 0; z < input_size - 1; z++)
41
35.9k
  {
42
35.9k
    ccv_nnc_tensor_view_t* c = outputs[0];
43
35.9k
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
44
35.9k
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
45
35.9k
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
46
35.9k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
47
35.9k
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
48
35.9k
    ccv_nnc_tensor_view_get_dim(a, dim);
49
35.9k
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
50
35.9k
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
51
35.9k
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
52
35.9k
    {
53
      // Super optimal case, just do one for-loop for sum.
54
35.9k
      const int tensor_count = ccv_nnc_tensor_count(a->info);
55
15.5M
      for (x = 0; x < tensor_count; x++)
56
15.4M
        c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
57
35.9k
      continue;
58
35.9k
    }
59
3
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
60
3
    ccv_nnc_tensor_view_get_stride(a, astride);
61
3
    ccv_nnc_tensor_view_get_stride(b, bstride);
62
3
    ccv_nnc_tensor_view_get_stride(c, cstride);
63
3
    int i[CCV_NNC_MAX_DIM + 2];
64
3
    float* const ap = a->data.f32;
65
3
    float* const bp = b->data.f32;
66
3
    float* const cp = c->data.f32;
67
3
    const int count = dim[2] * dim[3];
68
3
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
69
3
    {
70
      // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
71
6
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
72
3
      {
73
3
        float* ap0 = ap + i[0] * astride[0];
74
3
        float* bp0 = bp + i[0] * bstride[0];
75
3
        float* cp0 = cp + i[0] * cstride[0];
76
6
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
77
3
        {
78
6
          for (x = 0; x < count; x++)
79
3
            cp0[x] = ap0[x] + bp0[x];
80
3
          ap0 += astride[1];
81
3
          bp0 += bstride[1];
82
3
          cp0 += cstride[1];
83
3
        }
84
3
      }
85
3
      continue;
86
3
    }
87
    // Non-optimal case, need to do skip copy.
88
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
89
0
    {
90
0
      float* const ap0 = ap + i[0] * astride[0];
91
0
      float* const bp0 = bp + i[0] * bstride[0];
92
0
      float* const cp0 = cp + i[0] * cstride[0];
93
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
94
0
      {
95
0
        float* ap1 = ap0 + i[1] * astride[1];
96
0
        float* bp1 = bp0 + i[1] * bstride[1];
97
0
        float* cp1 = cp0 + i[1] * cstride[1];
98
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
99
0
        {
100
0
          for (x = 0; x < dim[3]; x++)
101
0
            cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
102
0
          ap1 += astride[2];
103
0
          bp1 += bstride[2];
104
0
          cp1 += cstride[2];
105
0
        }
106
0
      }
107
0
    }
108
0
  }
109
35.9k
}
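The in-place bookkeeping above is easy to misread, so here is a stand-alone sketch of the same pairing (hypothetical helper name, plain float arrays instead of ccv_nnc_tensor_view_t, equal sizes, no strides; an illustration, not part of the library): k marks the one input that aliases the output, that input is consumed on the first pass before the output is overwritten, and the remaining inputs are folded in one at a time, skipping index k.

#include <stdio.h>

/* Minimal restatement of the accumulation order used by
 * _ccv_nnc_ewsum_forw_cpu_ref_f32 above. */
static void ewsum_sketch(float* const* inputs, int input_size, float* out, int count)
{
  int k = 0, x, z;
  for (z = 1; z < input_size; z++)
    if (inputs[z] == out)
    {
      k = z; /* this input aliases the output, so it must be read first */
      break;
    }
  for (z = 0; z < input_size - 1; z++)
  {
    const float* a = z > 0 ? out : inputs[k];
    const float* b = z >= k ? inputs[z + 1] : inputs[z];
    for (x = 0; x < count; x++)
      out[x] = a[x] + b[x];
  }
}

int main(void)
{
  float in0[4] = {1, 2, 3, 4}, in1[4] = {10, 20, 30, 40}, in2[4] = {100, 200, 300, 400};
  float* inputs[3] = {in0, in1, in2};
  ewsum_sketch(inputs, 3, in1, 4); /* in1 doubles as the output (in-place) */
  for (int x = 0; x < 4; x++)
    printf("%g ", in1[x]); /* prints: 111 222 333 444 */
  printf("\n");
  return 0;
}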
110
111
void _ccv_nnc_ewsum_forw_cpu_ref_i32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
112
0
{
113
0
  if (input_size == 1 && output_size == 1)
114
0
  {
115
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
116
0
    return;
117
0
  }
118
  // Assuming this is int 32.
119
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
120
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
121
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
122
0
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
123
0
  int x, z;
124
0
  int k = 0;
125
  // Bad, I promised this can be an in-place operation. Need to first find out whether any of the inputs shares the same pointer as the output.
126
0
  for (z = 1; z < input_size; z++)
127
0
  {
128
0
    ccv_nnc_tensor_view_t* c = outputs[0];
129
0
    ccv_nnc_tensor_view_t* a = inputs[z];
130
0
    if (c->data.f32 == a->data.f32)
131
0
    {
132
0
      k = z;
133
0
      break;
134
0
    }
135
0
  }
136
0
  for (z = 0; z < input_size - 1; z++)
137
0
  {
138
0
    ccv_nnc_tensor_view_t* c = outputs[0];
139
0
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
140
0
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
141
0
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
142
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
143
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
144
0
    ccv_nnc_tensor_view_get_dim(a, dim);
145
0
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
146
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
147
0
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
148
0
    {
149
      // Super optimal case, just do one for-loop for sum.
150
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
151
0
      for (x = 0; x < tensor_count; x++)
152
0
        c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
153
0
      continue;
154
0
    }
155
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
156
0
    ccv_nnc_tensor_view_get_stride(a, astride);
157
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
158
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
159
0
    int i[CCV_NNC_MAX_DIM + 2];
160
0
    int* const ap = a->data.i32;
161
0
    int* const bp = b->data.i32;
162
0
    int* const cp = c->data.i32;
163
0
    const int count = dim[2] * dim[3];
164
0
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
165
0
    {
166
      // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
167
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
168
0
      {
169
0
        int* ap0 = ap + i[0] * astride[0];
170
0
        int* bp0 = bp + i[0] * bstride[0];
171
0
        int* cp0 = cp + i[0] * cstride[0];
172
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
173
0
        {
174
0
          for (x = 0; x < count; x++)
175
0
            cp0[x] = ap0[x] + bp0[x];
176
0
          ap0 += astride[1];
177
0
          bp0 += bstride[1];
178
0
          cp0 += cstride[1];
179
0
        }
180
0
      }
181
0
      continue;
182
0
    }
183
    // Non-optimal case, need to do skip copy.
184
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
185
0
    {
186
0
      int* const ap0 = ap + i[0] * astride[0];
187
0
      int* const bp0 = bp + i[0] * bstride[0];
188
0
      int* const cp0 = cp + i[0] * cstride[0];
189
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
190
0
      {
191
0
        int* ap1 = ap0 + i[1] * astride[1];
192
0
        int* bp1 = bp0 + i[1] * bstride[1];
193
0
        int* cp1 = cp0 + i[1] * cstride[1];
194
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
195
0
        {
196
0
          for (x = 0; x < dim[3]; x++)
197
0
            cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
198
0
          ap1 += astride[2];
199
0
          bp1 += bstride[2];
200
0
          cp1 += cstride[2];
201
0
        }
202
0
      }
203
0
    }
204
0
  }
205
0
}
206
207
static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
208
35.9k
{
209
35.9k
  if (outputs[0]->info.datatype == CCV_32S)
210
0
    _ccv_nnc_ewsum_forw_cpu_ref_i32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
211
35.9k
  else
212
35.9k
    _ccv_nnc_ewsum_forw_cpu_ref_f32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
213
35.9k
  return CCV_NNC_EXEC_SUCCESS;
214
35.9k
}
215
216
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
217
7.88k
{
218
  // D[x + y + z, x] = 1
219
7.88k
  int i;
220
7.88k
  if (inputs[0] == 0)
221
0
  {
222
    // Set them to 1.
223
0
    for (i = 0; i < output_size; i++)
224
0
      if (outputs[i])
225
0
        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], 1);
226
7.88k
  } else {
227
    // Copy over the gradient (if they are not pointing to the same tensor already).
228
23.6k
    for (i = 0; i < output_size; i++)
229
15.7k
      if (outputs[i] && inputs[0]->data.f32 != outputs[i]->data.f32)
230
7.66k
        _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[i]);
231
7.88k
  }
232
7.88k
  return CCV_NNC_EXEC_SUCCESS;
233
7.88k
}
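Because the derivative of a sum with respect to each addend is 1, the backward pass above does no arithmetic: with an incoming gradient g every requested output is simply a copy of g (skipped when it already aliases g), and without one the outputs are filled with ones. Restating the D[x + y + z, x] = 1 comment as an equation:

  h_j = g \cdot \frac{\partial (x_1 + \cdots + x_n)}{\partial x_j} = g \cdot 1 = g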
234
235
void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
236
30.6k
{
237
30.6k
  if (input_size == 1 && output_size == 1)
238
0
  {
239
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
240
0
    return;
241
0
  }
242
  // Assuming this is float 32.
243
30.6k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
244
30.6k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
245
30.6k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
246
30.6k
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
247
30.6k
  int x, z;
248
30.6k
  int k = 0;
249
  // Bad, I promised this can be an in-place operation. Need to first find out whether any of the inputs shares the same pointer as the output.
250
61.2k
  for (z = 1; z < input_size; z++)
251
30.6k
  {
252
30.6k
    ccv_nnc_tensor_view_t* c = outputs[0];
253
30.6k
    ccv_nnc_tensor_view_t* a = inputs[z];
254
30.6k
    if (c->data.f32 == a->data.f32)
255
12
    {
256
12
      k = z;
257
12
      break;
258
12
    }
259
30.6k
  }
260
61.2k
  for (z = 0; z < input_size - 1; z++)
261
30.6k
  {
262
30.6k
    ccv_nnc_tensor_view_t* c = outputs[0];
263
30.6k
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
264
30.6k
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
265
30.6k
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
266
30.6k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
267
30.6k
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
268
30.6k
    ccv_nnc_tensor_view_get_dim(a, dim);
269
30.6k
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
270
30.6k
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
271
30.6k
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
272
30.6k
    {
273
      // Super optimal case, just do one for-loop for the product.
274
30.6k
      const int tensor_count = ccv_nnc_tensor_count(a->info);
275
99.2k
      for (x = 0; x < tensor_count; x++)
276
68.6k
        c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
277
30.6k
      continue;
278
30.6k
    }
279
3
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
280
3
    ccv_nnc_tensor_view_get_stride(a, astride);
281
3
    ccv_nnc_tensor_view_get_stride(b, bstride);
282
3
    ccv_nnc_tensor_view_get_stride(c, cstride);
283
3
    int i[CCV_NNC_MAX_DIM + 2];
284
3
    float* const ap = a->data.f32;
285
3
    float* const bp = b->data.f32;
286
3
    float* const cp = c->data.f32;
287
3
    const int count = dim[2] * dim[3];
288
3
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
289
3
    {
290
      // Special casing if the ainc[3] is the same as dim[3]
291
6
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
292
3
      {
293
3
        float* ap0 = ap + i[0] * astride[0];
294
3
        float* bp0 = bp + i[0] * bstride[0];
295
3
        float* cp0 = cp + i[0] * cstride[0];
296
6
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
297
3
        {
298
6
          for (x = 0; x < count; x++)
299
3
            cp0[x] = ap0[x] * bp0[x];
300
3
          ap0 += astride[1];
301
3
          bp0 += bstride[1];
302
3
          cp0 += cstride[1];
303
3
        }
304
3
      }
305
3
      continue;
306
3
    }
307
    // Non-optimal case, need to do skip copy.
308
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
309
0
    {
310
0
      float* const ap0 = ap + i[0] * astride[0];
311
0
      float* const bp0 = bp + i[0] * bstride[0];
312
0
      float* const cp0 = cp + i[0] * cstride[0];
313
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
314
0
      {
315
0
        float* ap1 = ap0 + i[1] * astride[1];
316
0
        float* bp1 = bp0 + i[1] * bstride[1];
317
0
        float* cp1 = cp0 + i[1] * cstride[1];
318
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
319
0
        {
320
0
          for (x = 0; x < dim[3]; x++)
321
0
            cp1[x] = ap1[x] * bp1[x];
322
0
          ap1 += astride[2];
323
0
          bp1 += bstride[2];
324
0
          cp1 += cstride[2];
325
0
        }
326
0
      }
327
0
    }
328
0
  }
329
30.6k
}
330
331
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
332
20.1k
{
333
20.1k
  _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
334
20.1k
  return CCV_NNC_EXEC_SUCCESS;
335
20.1k
}
336
337
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
338
20.0k
{
339
  // D[x * y * z, x] = y * z
340
  // Assuming this is float 32.
341
20.0k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
342
20.0k
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
343
20.0k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
344
20.0k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
345
20.0k
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
346
20.0k
  int x, z;
347
20.0k
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
348
20.0k
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
349
20.0k
  if (g == 0)
350
0
  {
351
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
352
0
    ccv_nnc_tensor_view_get_dim(b, dim);
353
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
354
0
    for (z = 0; z < output_size; z++)
355
0
    {
356
0
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
357
0
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
358
0
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
359
0
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
360
0
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
361
0
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
362
0
      ccv_nnc_tensor_view_get_stride(a, astride);
363
0
      ccv_nnc_tensor_view_get_stride(h, hstride);
364
0
      if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
365
0
      {
366
        // Super optimal case, just do one for-loop over all elements.
367
0
        const int tensor_count = ccv_nnc_tensor_count(b->info);
368
0
        for (x = 0; x < tensor_count; x++)
369
0
          h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
370
0
        continue;
371
0
      }
372
0
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
373
0
      int i[CCV_NNC_MAX_DIM + 2];
374
0
      float* const ap = a->data.f32;
375
0
      float* const bp = b->data.f32;
376
0
      float* const hp = h->data.f32;
377
0
      const int count = dim[2] * dim[3];
378
0
      if (astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
379
0
      {
380
        // Special casing if the ainc[3] is the same as dim[3]
381
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
382
0
        {
383
0
          float* ap0 = ap + i[0] * astride[0];
384
0
          float* bp0 = bp + i[0] * bstride[0];
385
0
          float* hp0 = hp + i[0] * hstride[0];
386
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
387
0
          {
388
0
            for (x = 0; x < count; x++)
389
0
              hp0[x] = bp0[x] / ap0[x];
390
0
            ap0 += astride[1];
391
0
            bp0 += bstride[1];
392
0
            hp0 += hstride[1];
393
0
          }
394
0
        }
395
0
        continue;
396
0
      }
397
      // Non-optimal case, need to do skip copy.
398
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
399
0
      {
400
0
        float* const ap0 = ap + i[0] * astride[0];
401
0
        float* const bp0 = bp + i[0] * bstride[0];
402
0
        float* const hp0 = hp + i[0] * hstride[0];
403
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
404
0
        {
405
0
          float* ap1 = ap0 + i[1] * astride[1];
406
0
          float* bp1 = bp0 + i[1] * bstride[1];
407
0
          float* hp1 = hp0 + i[1] * hstride[1];
408
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
409
0
          {
410
0
            for (x = 0; x < dim[3]; x++)
411
0
              hp1[x] = bp1[x] / ap1[x];
412
0
            ap1 += astride[2];
413
0
            bp1 += bstride[2];
414
0
            hp1 += hstride[2];
415
0
          }
416
0
        }
417
0
      }
418
0
    }
419
20.0k
  } else {
420
20.0k
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
421
20.0k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
422
20.0k
    ccv_nnc_tensor_view_get_dim(b, dim);
423
20.0k
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
424
20.0k
    ccv_nnc_tensor_view_get_stride(b, bstride);
425
20.0k
    ccv_nnc_tensor_view_get_stride(g, gstride);
426
60.0k
    for (z = 0; z < output_size; z++)
427
40.0k
    {
428
40.0k
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
429
40.0k
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
430
40.0k
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
431
40.0k
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
432
40.0k
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
433
40.0k
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
434
40.0k
      ccv_nnc_tensor_view_get_stride(a, astride);
435
40.0k
      ccv_nnc_tensor_view_get_stride(h, hstride);
436
40.0k
      if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
437
40.0k
      {
438
        // Super optimal case, just do one for-loop over all elements.
439
40.0k
        const int tensor_count = ccv_nnc_tensor_count(g->info);
440
129k
        for (x = 0; x < tensor_count; x++)
441
89.2k
          h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
442
40.0k
        continue;
443
40.0k
      }
444
3
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
445
3
      int i[CCV_NNC_MAX_DIM + 2];
446
3
      float* const gp = g->data.f32;
447
3
      float* const ap = a->data.f32;
448
3
      float* const bp = b->data.f32;
449
3
      float* const hp = h->data.f32;
450
3
      const int count = dim[2] * dim[3];
451
3
      if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
452
3
      {
453
        // Special casing if the ainc[3] is the same as dim[3]
454
6
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
455
3
        {
456
3
          float* gp0 = gp + i[0] * gstride[0];
457
3
          float* ap0 = ap + i[0] * astride[0];
458
3
          float* bp0 = bp + i[0] * bstride[0];
459
3
          float* hp0 = hp + i[0] * hstride[0];
460
6
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
461
3
          {
462
6
            for (x = 0; x < count; x++)
463
3
              hp0[x] = gp0[x] * bp0[x] / ap0[x];
464
3
            gp0 += gstride[1];
465
3
            ap0 += astride[1];
466
3
            bp0 += bstride[1];
467
3
            hp0 += hstride[1];
468
3
          }
469
3
        }
470
3
        continue;
471
3
      }
472
      // Non-optimal case, need to do skip copy.
473
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
474
0
      {
475
0
        float* const gp0 = gp + i[0] * gstride[0];
476
0
        float* const ap0 = ap + i[0] * astride[0];
477
0
        float* const bp0 = bp + i[0] * bstride[0];
478
0
        float* const hp0 = hp + i[0] * hstride[0];
479
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
480
0
        {
481
0
          float* gp1 = gp0 + i[1] * gstride[1];
482
0
          float* ap1 = ap0 + i[1] * astride[1];
483
0
          float* bp1 = bp0 + i[1] * bstride[1];
484
0
          float* hp1 = hp0 + i[1] * hstride[1];
485
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
486
0
          {
487
0
            for (x = 0; x < dim[3]; x++)
488
0
              hp1[x] = gp1[x] * bp1[x] / ap1[x];
489
0
            gp1 += gstride[2];
490
0
            ap1 += astride[2];
491
0
            bp1 += bstride[2];
492
0
            hp1 += hstride[2];
493
0
          }
494
0
        }
495
0
      }
496
0
    }
497
20.0k
  }
498
20.0k
  return CCV_NNC_EXEC_SUCCESS;
499
20.0k
}
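A note on why the backward pass above may divide by a: b here is the saved forward output, i.e. the product of all inputs, so dividing it by one factor recovers the product of the remaining factors (the D[x * y * z, x] = y * z comment in compact form). The shortcut implicitly assumes the factor is non-zero at that element.

  b = \prod_i x_i, \qquad \frac{\partial b}{\partial x_j} = \prod_{i \ne j} x_i = \frac{b}{x_j}, \qquad h_j = g \cdot \frac{b}{x_j}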
500
501
static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
502
258
{
503
  // Assuming this is float 32.
504
258
  int dim[CCV_NNC_MAX_DIM_ALLOC];
505
258
  int astride[CCV_NNC_MAX_DIM_ALLOC];
506
258
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
507
258
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
508
258
  if (a == 0) // Take 0 as all ones tensor.
509
19
  {
510
19
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
511
19
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
512
19
    ccv_nnc_tensor_view_get_dim(b, dim);
513
19
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
514
19
    int x;
515
19
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
516
19
    {
517
      // Super optimal case, just do one for-loop for the division.
518
19
      const int tensor_count = ccv_nnc_tensor_count(b->info);
519
1.98k
      for (x = 0; x < tensor_count; x++)
520
1.96k
        c->data.f32[x] = p / b->data.f32[x];
521
19
      return;
522
19
    }
523
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
524
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
525
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
526
0
    int i[CCV_NNC_MAX_DIM + 2];
527
0
    float* const bp = b->data.f32;
528
0
    float* const cp = c->data.f32;
529
0
    const int count = dim[2] * dim[3];
530
0
    if (bstride[2] == dim[3] && cstride[2] == dim[3])
531
0
    {
532
      // Special casing if the ainc[3] is the same as dim[3]
533
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
534
0
      {
535
0
        float* bp0 = bp + i[0] * bstride[0];
536
0
        float* cp0 = cp + i[0] * cstride[0];
537
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
538
0
        {
539
0
          for (x = 0; x < count; x++)
540
0
            cp0[x] = p / bp0[x];
541
0
          bp0 += bstride[1];
542
0
          cp0 += cstride[1];
543
0
        }
544
0
      }
545
0
      return;
546
0
    }
547
    // Non-optimal case, need to do skip copy.
548
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
549
0
    {
550
0
      float* const bp0 = bp + i[0] * bstride[0];
551
0
      float* const cp0 = cp + i[0] * cstride[0];
552
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
553
0
      {
554
0
        float* bp1 = bp0 + i[1] * bstride[1];
555
0
        float* cp1 = cp0 + i[1] * cstride[1];
556
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
557
0
        {
558
0
          for (x = 0; x < dim[3]; x++)
559
0
            cp1[x] = p / bp1[x];
560
0
          bp1 += bstride[2];
561
0
          cp1 += cstride[2];
562
0
        }
563
0
      }
564
0
    }
565
239
  } else {
566
239
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
567
239
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
568
239
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
569
239
    ccv_nnc_tensor_view_get_dim(a, dim);
570
239
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
571
239
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
572
239
    int x;
573
239
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
574
239
    {
575
      // Super optimal case, just do one for-loop for the division.
576
239
      const int tensor_count = ccv_nnc_tensor_count(a->info);
577
7.04k
      for (x = 0; x < tensor_count; x++)
578
6.80k
        c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
579
239
      return;
580
239
    }
581
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
582
0
    ccv_nnc_tensor_view_get_stride(a, astride);
583
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
584
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
585
0
    int i[CCV_NNC_MAX_DIM + 2];
586
0
    float* const ap = a->data.f32;
587
0
    float* const bp = b->data.f32;
588
0
    float* const cp = c->data.f32;
589
0
    const int count = dim[2] * dim[3];
590
0
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
591
0
    {
592
      // Special casing if the ainc[3] is the same as dim[3]
593
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
594
0
      {
595
0
        float* ap0 = ap + i[0] * astride[0];
596
0
        float* bp0 = bp + i[0] * bstride[0];
597
0
        float* cp0 = cp + i[0] * cstride[0];
598
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
599
0
        {
600
0
          for (x = 0; x < count; x++)
601
0
            cp0[x] = p * ap0[x] / bp0[x];
602
0
          ap0 += astride[1];
603
0
          bp0 += bstride[1];
604
0
          cp0 += cstride[1];
605
0
        }
606
0
      }
607
0
      return;
608
0
    }
609
    // Non-optimal case, need to do skip copy.
610
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
611
0
    {
612
0
      float* const ap0 = ap + i[0] * astride[0];
613
0
      float* const bp0 = bp + i[0] * bstride[0];
614
0
      float* const cp0 = cp + i[0] * cstride[0];
615
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
616
0
      {
617
0
        float* ap1 = ap0 + i[1] * astride[1];
618
0
        float* bp1 = bp0 + i[1] * bstride[1];
619
0
        float* cp1 = cp0 + i[1] * cstride[1];
620
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
621
0
        {
622
0
          for (x = 0; x < dim[3]; x++)
623
0
            cp1[x] = p * ap1[x] / bp1[x];
624
0
          ap1 += astride[2];
625
0
          bp1 += bstride[2];
626
0
          cp1 += cstride[2];
627
0
        }
628
0
      }
629
0
    }
630
0
  }
631
258
}
632
633
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
634
30
{
635
30
  _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
636
30
  return CCV_NNC_EXEC_SUCCESS;
637
30
}
638
639
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
640
16
{
641
  // D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
642
16
  if (output_size == 1 || outputs[1] == 0)
643
2
  {
644
    // When we only need D[x / y, x]
645
2
    _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
646
2
    return CCV_NNC_EXEC_SUCCESS;
647
2
  }
648
14
  int dim[CCV_NNC_MAX_DIM_ALLOC];
649
14
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
650
14
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
651
14
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
652
14
  int hastride[CCV_NNC_MAX_DIM_ALLOC];
653
14
  int hbstride[CCV_NNC_MAX_DIM_ALLOC];
654
14
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
655
14
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
656
14
  ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
657
14
  ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
658
14
  ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
659
14
  if (g == 0)
660
0
  {
661
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
662
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
663
0
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
664
0
    ccv_nnc_tensor_view_get_dim(b, dim);
665
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
666
0
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
667
0
    if (ha)
668
0
    {
669
0
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
670
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
671
0
    }
672
0
    int x;
673
0
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
674
0
    {
675
      // Super optimal case, just do one for-loop over all elements.
676
0
      const int tensor_count = ccv_nnc_tensor_count(b->info);
677
0
      if (ha == 0)
678
0
      {
679
0
        for (x = 0; x < tensor_count; x++)
680
0
        {
681
0
          const float v = 1 / b->data.f32[x];
682
0
          hb->data.f32[x] = -c->data.f32[x] * v;
683
0
        }
684
0
      } else {
685
0
        for (x = 0; x < tensor_count; x++)
686
0
        {
687
0
          const float v = 1 / b->data.f32[x];
688
0
          ha->data.f32[x] = v;
689
0
          hb->data.f32[x] = -c->data.f32[x] * v;
690
0
        }
691
0
      }
692
0
      return CCV_NNC_EXEC_SUCCESS;
693
0
    }
694
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
695
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
696
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
697
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
698
0
    int i[CCV_NNC_MAX_DIM + 2];
699
0
    float* const bp = b->data.f32;
700
0
    float* const cp = c->data.f32;
701
0
    float* const hbp = hb->data.f32;
702
0
    const int count = dim[2] * dim[3];
703
0
    if (ha == 0)
704
0
    {
705
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
706
0
      {
707
        // Special casing if the ainc[3] is the same as dim[3]
708
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
709
0
        {
710
0
          float* bp0 = bp + i[0] * bstride[0];
711
0
          float* cp0 = cp + i[0] * cstride[0];
712
0
          float* hbp0 = hbp + i[0] * hbstride[0];
713
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
714
0
          {
715
0
            for (x = 0; x < count; x++)
716
0
            {
717
0
              const float v = 1 / bp0[x];
718
0
              hbp0[x] = -cp0[x] * v;
719
0
            }
720
0
            bp0 += bstride[1];
721
0
            cp0 += cstride[1];
722
0
            hbp0 += hbstride[1];
723
0
          }
724
0
        }
725
0
        return CCV_NNC_EXEC_SUCCESS;
726
0
      }
727
      // Non-optimal case, need to do skip copy.
728
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
729
0
      {
730
0
        float* const bp0 = bp + i[0] * bstride[0];
731
0
        float* const cp0 = cp + i[0] * cstride[0];
732
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
733
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
734
0
        {
735
0
          float* bp1 = bp0 + i[1] * bstride[1];
736
0
          float* cp1 = cp0 + i[1] * cstride[1];
737
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
738
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
739
0
          {
740
0
            for (x = 0; x < dim[3]; x++)
741
0
            {
742
0
              const float v = 1 / bp1[x];
743
0
              hbp1[x] = -cp1[x] * v;
744
0
            }
745
0
            bp1 += bstride[2];
746
0
            cp1 += cstride[2];
747
0
            hbp1 += hbstride[2];
748
0
          }
749
0
        }
750
0
      }
751
0
    } else {
752
0
      float* const hap = ha->data.f32;
753
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
754
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
755
0
      {
756
        // Special casing if the ainc[3] is the same as dim[3]
757
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
758
0
        {
759
0
          float* bp0 = bp + i[0] * bstride[0];
760
0
          float* cp0 = cp + i[0] * cstride[0];
761
0
          float* hap0 = hap + i[0] * hastride[0];
762
0
          float* hbp0 = hbp + i[0] * hbstride[0];
763
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
764
0
          {
765
0
            for (x = 0; x < count; x++)
766
0
            {
767
0
              const float v = 1 / bp0[x];
768
0
              hap0[x] = v;
769
0
              hbp0[x] = -cp0[x] * v;
770
0
            }
771
0
            bp0 += bstride[1];
772
0
            cp0 += cstride[1];
773
0
            hap0 += hastride[1];
774
0
            hbp0 += hbstride[1];
775
0
          }
776
0
        }
777
0
        return CCV_NNC_EXEC_SUCCESS;
778
0
      }
779
      // Non-optimal case, need to do skip copy.
780
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
781
0
      {
782
0
        float* const bp0 = bp + i[0] * bstride[0];
783
0
        float* const cp0 = cp + i[0] * cstride[0];
784
0
        float* const hap0 = hap + i[0] * hastride[0];
785
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
786
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
787
0
        {
788
0
          float* bp1 = bp0 + i[1] * bstride[1];
789
0
          float* cp1 = cp0 + i[1] * cstride[1];
790
0
          float* hap1 = hap0 + i[1] * hastride[1];
791
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
792
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
793
0
          {
794
0
            for (x = 0; x < dim[3]; x++)
795
0
            {
796
0
              const float v = 1 / bp1[x];
797
0
              hap1[x] = v;
798
0
              hbp1[x] = -cp1[x] * v;
799
0
            }
800
0
            bp1 += bstride[2];
801
0
            cp1 += cstride[2];
802
0
            hap1 += hastride[2];
803
0
            hbp1 += hbstride[2];
804
0
          }
805
0
        }
806
0
      }
807
0
    }
808
14
  } else {
809
14
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
810
14
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
811
14
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
812
14
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
813
14
    ccv_nnc_tensor_view_get_dim(b, dim);
814
14
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
815
14
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
816
14
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
817
14
    if (ha)
818
1
    {
819
1
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
820
1
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
821
1
    }
822
14
    int x;
823
14
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
824
14
    {
825
      // Super optimal case, just do one for-loop over all elements.
826
14
      const int tensor_count = ccv_nnc_tensor_count(g->info);
827
14
      if (ha == 0)
828
13
      {
829
1.50k
        for (x = 0; x < tensor_count; x++)
830
1.48k
        {
831
1.48k
          const float v = g->data.f32[x] / b->data.f32[x];
832
1.48k
          hb->data.f32[x] = -c->data.f32[x] * v;
833
1.48k
        }
834
13
      } else {
835
2
        for (x = 0; x < tensor_count; x++)
836
1
        {
837
1
          const float v = g->data.f32[x] / b->data.f32[x];
838
1
          ha->data.f32[x] = v;
839
1
          hb->data.f32[x] = -c->data.f32[x] * v;
840
1
        }
841
1
      }
842
14
      return CCV_NNC_EXEC_SUCCESS;
843
14
    }
844
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
845
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
846
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
847
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
848
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
849
0
    int i[CCV_NNC_MAX_DIM + 2];
850
0
    float* const gp = g->data.f32;
851
0
    float* const bp = b->data.f32;
852
0
    float* const cp = c->data.f32;
853
0
    float* const hbp = hb->data.f32;
854
0
    const int count = dim[2] * dim[3];
855
0
    if (ha == 0)
856
0
    {
857
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
858
0
      {
859
        // Special casing if the ainc[3] is the same as dim[3]
860
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
861
0
        {
862
0
          float* gp0 = gp + i[0] * gstride[0];
863
0
          float* bp0 = bp + i[0] * bstride[0];
864
0
          float* cp0 = cp + i[0] * cstride[0];
865
0
          float* hbp0 = hbp + i[0] * hbstride[0];
866
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
867
0
          {
868
0
            for (x = 0; x < count; x++)
869
0
            {
870
0
              const float v = gp0[x] / bp0[x];
871
0
              hbp0[x] = -cp0[x] * v;
872
0
            }
873
0
            gp0 += gstride[1];
874
0
            bp0 += bstride[1];
875
0
            cp0 += cstride[1];
876
0
            hbp0 += hbstride[1];
877
0
          }
878
0
        }
879
0
        return CCV_NNC_EXEC_SUCCESS;
880
0
      }
881
      // Non-optimal case, need to do skip copy.
882
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
883
0
      {
884
0
        float* const gp0 = gp + i[0] * gstride[0];
885
0
        float* const bp0 = bp + i[0] * bstride[0];
886
0
        float* const cp0 = cp + i[0] * cstride[0];
887
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
888
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
889
0
        {
890
0
          float* gp1 = gp0 + i[1] * gstride[1];
891
0
          float* bp1 = bp0 + i[1] * bstride[1];
892
0
          float* cp1 = cp0 + i[1] * cstride[1];
893
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
894
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
895
0
          {
896
0
            for (x = 0; x < dim[3]; x++)
897
0
            {
898
0
              const float v = gp1[x] / bp1[x];
899
0
              hbp1[x] = -cp1[x] * v;
900
0
            }
901
0
            gp1 += gstride[2];
902
0
            bp1 += bstride[2];
903
0
            cp1 += cstride[2];
904
0
            hbp1 += hbstride[2];
905
0
          }
906
0
        }
907
0
      }
908
0
    } else {
909
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
910
0
      float* const hap = ha->data.f32;
911
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
912
0
      {
913
        // Special casing if the ainc[3] is the same as dim[3]
914
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
915
0
        {
916
0
          float* gp0 = gp + i[0] * gstride[0];
917
0
          float* bp0 = bp + i[0] * bstride[0];
918
0
          float* cp0 = cp + i[0] * cstride[0];
919
0
          float* hap0 = hap + i[0] * hastride[0];
920
0
          float* hbp0 = hbp + i[0] * hbstride[0];
921
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
922
0
          {
923
0
            for (x = 0; x < count; x++)
924
0
            {
925
0
              const float v = gp0[x] / bp0[x];
926
0
              hap0[x] = v;
927
0
              hbp0[x] = -cp0[x] * v;
928
0
            }
929
0
            gp0 += gstride[1];
930
0
            bp0 += bstride[1];
931
0
            cp0 += cstride[1];
932
0
            hap0 += hastride[1];
933
0
            hbp0 += hbstride[1];
934
0
          }
935
0
        }
936
0
        return CCV_NNC_EXEC_SUCCESS;
937
0
      }
938
      // Non-optimal case, need to do skip copy.
939
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
940
0
      {
941
0
        float* const gp0 = gp + i[0] * gstride[0];
942
0
        float* const bp0 = bp + i[0] * bstride[0];
943
0
        float* const cp0 = cp + i[0] * cstride[0];
944
0
        float* const hap0 = hap + i[0] * hastride[0];
945
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
946
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
947
0
        {
948
0
          float* gp1 = gp0 + i[1] * gstride[1];
949
0
          float* bp1 = bp0 + i[1] * bstride[1];
950
0
          float* cp1 = cp0 + i[1] * cstride[1];
951
0
          float* hap1 = hap0 + i[1] * hastride[1];
952
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
953
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
954
0
          {
955
0
            for (x = 0; x < dim[3]; x++)
956
0
            {
957
0
              const float v = gp1[x] / bp1[x];
958
0
              hap1[x] = v;
959
0
              hbp1[x] = -cp1[x] * v;
960
0
            }
961
0
            gp1 += gstride[2];
962
0
            bp1 += bstride[2];
963
0
            cp1 += cstride[2];
964
0
            hap1 += hastride[2];
965
0
            hbp1 += hbstride[2];
966
0
          }
967
0
        }
968
0
      }
969
0
    }
970
0
  }
971
0
  return CCV_NNC_EXEC_SUCCESS;
972
14
}
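The backward pass above reuses the saved forward output c = x / y (inputs[3]) so that neither x nor y^2 has to be recomputed; with v = g / y as in the loops:

  \frac{\partial}{\partial x}\frac{x}{y} = \frac{1}{y} \;\Rightarrow\; h_a = \frac{g}{y} = v, \qquad \frac{\partial}{\partial y}\frac{x}{y} = -\frac{x}{y^2} \;\Rightarrow\; h_b = -g\,\frac{x}{y^2} = -c \cdot v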
973
974
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
975
21
{
976
  // Assuming this is float 32.
977
21
  int dim[CCV_NNC_MAX_DIM_ALLOC];
978
21
  int astride[CCV_NNC_MAX_DIM_ALLOC];
979
21
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
980
21
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
981
21
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
982
21
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
983
21
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
984
21
  ccv_nnc_tensor_view_get_dim(a, dim);
985
21
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
986
21
  int x;
987
21
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
988
20
  {
989
    // Super optimal case, just do one for-loop over all elements.
990
20
    const int tensor_count = ccv_nnc_tensor_count(a->info);
991
3.07k
    for (x = 0; x < tensor_count; x++)
992
3.05k
      b->data.f32[x] = exp(a->data.f32[x]);
993
20
    return CCV_NNC_EXEC_SUCCESS;
994
20
  }
995
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
996
1
  ccv_nnc_tensor_view_get_stride(a, astride);
997
1
  ccv_nnc_tensor_view_get_stride(b, bstride);
998
1
  int i[CCV_NNC_MAX_DIM + 2];
999
1
  float* const ap = a->data.f32;
1000
1
  float* const bp = b->data.f32;
1001
1
  const int count = dim[2] * dim[3];
1002
1
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1003
1
  {
1004
    // Special casing if the ainc[3] is the same as dim[3]
1005
2
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1006
1
    {
1007
1
      float* ap0 = ap + i[0] * astride[0];
1008
1
      float* bp0 = bp + i[0] * bstride[0];
1009
2
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1010
1
      {
1011
2
        for (x = 0; x < count; x++)
1012
1
          bp0[x] = exp(ap0[x]);
1013
1
        ap0 += astride[1];
1014
1
        bp0 += bstride[1];
1015
1
      }
1016
1
    }
1017
1
    return CCV_NNC_EXEC_SUCCESS;
1018
1
  }
1019
  // Non-optimal case, need to do skip copy.
1020
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1021
0
  {
1022
0
    float* const ap0 = ap + i[0] * astride[0];
1023
0
    float* const bp0 = bp + i[0] * bstride[0];
1024
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1025
0
    {
1026
0
      float* ap1 = ap0 + i[1] * astride[1];
1027
0
      float* bp1 = bp0 + i[1] * bstride[1];
1028
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1029
0
      {
1030
0
        for (x = 0; x < dim[3]; x++)
1031
0
          bp1[x] = exp(ap1[x]);
1032
0
        ap1 += astride[2];
1033
0
        bp1 += bstride[2];
1034
0
      }
1035
0
    }
1036
0
  }
1037
0
  return CCV_NNC_EXEC_SUCCESS;
1038
1
}
1039
1040
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1041
9
{
1042
  // D[Exp[x], x] = Exp[x]
1043
9
  if (inputs[0] == 0)
1044
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1045
9
  else
1046
9
    _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
1047
9
      (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2]
1048
9
    }, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
1049
9
  return CCV_NNC_EXEC_SUCCESS;
1050
9
}
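Since D[Exp[x], x] = Exp[x], the backward pass above reuses the saved forward output inputs[2] = Exp[x]: with an incoming gradient it forms the element-wise product h = g * Exp[x] via _ccv_nnc_ewprod_forw_cpu_ref, and without one it just copies Exp[x] into the output.

  h = g \cdot \frac{d}{dx} e^{x} = g \, e^{x}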
1051
1052
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1053
258
{
1054
  // Assuming this is float 32.
1055
258
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1056
258
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1057
258
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1058
258
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1059
258
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1060
258
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1061
258
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1062
258
  ccv_nnc_tensor_view_get_dim(a, dim);
1063
258
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1064
258
  int x;
1065
258
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1066
258
  {
1067
    // Super optimal case, just do one for-loop over all elements.
1068
258
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1069
3.55k
    for (x = 0; x < tensor_count; x++)
1070
3.29k
      b->data.f32[x] = log(a->data.f32[x]);
1071
258
    return CCV_NNC_EXEC_SUCCESS;
1072
258
  }
1073
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1074
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1075
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1076
0
  int i[CCV_NNC_MAX_DIM + 2];
1077
0
  float* const ap = a->data.f32;
1078
0
  float* const bp = b->data.f32;
1079
0
  const int count = dim[2] * dim[3];
1080
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1081
0
  {
1082
    // Special casing if the ainc[3] is the same as dim[3]
1083
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1084
0
    {
1085
0
      float* ap0 = ap + i[0] * astride[0];
1086
0
      float* bp0 = bp + i[0] * bstride[0];
1087
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1088
0
      {
1089
0
        for (x = 0; x < count; x++)
1090
0
          bp0[x] = log(ap0[x]);
1091
0
        ap0 += astride[1];
1092
0
        bp0 += bstride[1];
1093
0
      }
1094
0
    }
1095
0
    return CCV_NNC_EXEC_SUCCESS;
1096
0
  }
1097
  // Non-optimal case, need to do skip copy.
1098
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1099
0
  {
1100
0
    float* const ap0 = ap + i[0] * astride[0];
1101
0
    float* const bp0 = bp + i[0] * bstride[0];
1102
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1103
0
    {
1104
0
      float* ap1 = ap0 + i[1] * astride[1];
1105
0
      float* bp1 = bp0 + i[1] * bstride[1];
1106
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1107
0
      {
1108
0
        for (x = 0; x < dim[3]; x++)
1109
0
          bp1[x] = log(ap1[x]);
1110
0
        ap1 += astride[2];
1111
0
        bp1 += bstride[2];
1112
0
      }
1113
0
    }
1114
0
  }
1115
0
  return CCV_NNC_EXEC_SUCCESS;
1116
0
}
1117
1118
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1119
224
{
1120
  // D[Log[x], x] = 1 / x
1121
224
  _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
1122
224
  return CCV_NNC_EXEC_SUCCESS;
1123
224
}
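With D[Log[x], x] = 1 / x, the backward pass above is a single call to the division helper with p = 1, the incoming gradient as numerator and the forward input x as denominator:

  h = g \cdot \frac{d}{dx} \log x = \frac{g}{x}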
1124
1125
static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1126
5
{
1127
  // Assuming this is float 32.
1128
5
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1129
5
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1130
5
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1131
5
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1132
5
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1133
5
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1134
5
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1135
5
  ccv_nnc_tensor_view_get_dim(a, dim);
1136
5
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1137
5
  int x;
1138
5
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1139
5
  {
1140
    // Super optimal case, just do one for-loop over all elements.
1141
5
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1142
2.01k
    for (x = 0; x < tensor_count; x++)
1143
2.01k
      b->data.f32[x] = sqrt(a->data.f32[x]);
1144
5
    return CCV_NNC_EXEC_SUCCESS;
1145
5
  }
1146
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1147
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1148
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1149
0
  int i[CCV_NNC_MAX_DIM + 2];
1150
0
  float* const ap = a->data.f32;
1151
0
  float* const bp = b->data.f32;
1152
0
  const int count = dim[2] * dim[3];
1153
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1154
0
  {
1155
    // Special casing if the ainc[3] is the same as dim[3]
1156
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1157
0
    {
1158
0
      float* ap0 = ap + i[0] * astride[0];
1159
0
      float* bp0 = bp + i[0] * bstride[0];
1160
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1161
0
      {
1162
0
        for (x = 0; x < count; x++)
1163
0
          bp0[x] = sqrt(ap0[x]);
1164
0
        ap0 += astride[1];
1165
0
        bp0 += bstride[1];
1166
0
      }
1167
0
    }
1168
0
    return CCV_NNC_EXEC_SUCCESS;
1169
0
  }
1170
  // Non-optimal case, need to do skip copy.
1171
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1172
0
  {
1173
0
    float* const ap0 = ap + i[0] * astride[0];
1174
0
    float* const bp0 = bp + i[0] * bstride[0];
1175
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1176
0
    {
1177
0
      float* ap1 = ap0 + i[1] * astride[1];
1178
0
      float* bp1 = bp0 + i[1] * bstride[1];
1179
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1180
0
      {
1181
0
        for (x = 0; x < dim[3]; x++)
1182
0
          bp1[x] = sqrt(ap1[x]);
1183
0
        ap1 += astride[2];
1184
0
        bp1 += bstride[2];
1185
0
      }
1186
0
    }
1187
0
  }
1188
0
  return CCV_NNC_EXEC_SUCCESS;
1189
0
}
1190
1191
static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1192
2
{
1193
  // D[Sqrt[x], x] = 0.5 / Sqrt[x]
1194
2
  _ccv_nnc_ewdiv_forw_cpu_ref(0.5, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1195
2
  return CCV_NNC_EXEC_SUCCESS;
1196
2
}
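Likewise for the square root: inputs[2] already holds the forward output Sqrt[x], so passing p = 0.5 to the division helper yields the gradient without touching x itself:

  h = g \cdot \frac{1}{2\sqrt{x}} = 0.5 \cdot \frac{g}{\sqrt{x}}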
1197
1198
static int _ccv_nnc_clamp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1199
8
{
1200
  // Assuming this is float 32.
1201
8
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1202
8
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1203
8
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1204
8
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1205
8
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1206
8
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1207
8
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1208
8
  ccv_nnc_tensor_view_get_dim(a, dim);
1209
8
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1210
8
  int x;
1211
8
  const float min = cmd.info.clamp.min;
1212
8
  const float max = cmd.info.clamp.max;
1213
8
  assert(!isnan(min) || !isnan(max));
1214
8
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1215
8
  {
1216
    // Super optimal case, just do one for-loop over all elements.
1217
8
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1218
8
    if (isnan(min))
1219
4
    {
1220
2.00k
      for (x = 0; x < tensor_count; x++)
1221
2.00k
        b->data.f32[x] = ccv_min(a->data.f32[x], max);
1222
4
    } else if (isnan(max)) {
1223
2.00k
      for (x = 0; x < tensor_count; x++)
1224
2.00k
        b->data.f32[x] = ccv_max(a->data.f32[x], min);
1225
2
    } else {
1226
2.00k
      for (x = 0; x < tensor_count; x++)
1227
2.00k
        b->data.f32[x] = ccv_clamp(a->data.f32[x], min, max);
1228
2
    }
1229
8
    return CCV_NNC_EXEC_SUCCESS;
1230
8
  }
1231
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1232
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1233
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1234
0
  int i[CCV_NNC_MAX_DIM + 2];
1235
0
  float* const ap = a->data.f32;
1236
0
  float* const bp = b->data.f32;
1237
0
  const int count = dim[2] * dim[3];
1238
0
  if (isnan(min))
1239
0
  {
1240
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1241
0
    {
1242
      // Special casing if the ainc[3] is the same as dim[3]
1243
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1244
0
      {
1245
0
        float* ap0 = ap + i[0] * astride[0];
1246
0
        float* bp0 = bp + i[0] * bstride[0];
1247
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1248
0
        {
1249
0
          for (x = 0; x < count; x++)
1250
0
            bp0[x] = ccv_min(ap0[x], max);
1251
0
          ap0 += astride[1];
1252
0
          bp0 += bstride[1];
1253
0
        }
1254
0
      }
1255
0
      return CCV_NNC_EXEC_SUCCESS;
1256
0
    }
1257
    // Non-optimal case, need to do skip copy.
1258
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1259
0
    {
1260
0
      float* const ap0 = ap + i[0] * astride[0];
1261
0
      float* const bp0 = bp + i[0] * bstride[0];
1262
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1263
0
      {
1264
0
        float* ap1 = ap0 + i[1] * astride[1];
1265
0
        float* bp1 = bp0 + i[1] * bstride[1];
1266
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1267
0
        {
1268
0
          for (x = 0; x < dim[3]; x++)
1269
0
            bp1[x] = ccv_min(ap1[x], max);
1270
0
          ap1 += astride[2];
1271
0
          bp1 += bstride[2];
1272
0
        }
1273
0
      }
1274
0
    }
1275
0
  } else if (isnan(max)) {
1276
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1277
0
    {
1278
      // Special casing if astride[2] is the same as dim[3]
1279
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1280
0
      {
1281
0
        float* ap0 = ap + i[0] * astride[0];
1282
0
        float* bp0 = bp + i[0] * bstride[0];
1283
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1284
0
        {
1285
0
          for (x = 0; x < count; x++)
1286
0
            bp0[x] = ccv_max(ap0[x], min);
1287
0
          ap0 += astride[1];
1288
0
          bp0 += bstride[1];
1289
0
        }
1290
0
      }
1291
0
      return CCV_NNC_EXEC_SUCCESS;
1292
0
    }
1293
    // Non-optimal case, need to do skip copy.
1294
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1295
0
    {
1296
0
      float* const ap0 = ap + i[0] * astride[0];
1297
0
      float* const bp0 = bp + i[0] * bstride[0];
1298
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1299
0
      {
1300
0
        float* ap1 = ap0 + i[1] * astride[1];
1301
0
        float* bp1 = bp0 + i[1] * bstride[1];
1302
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1303
0
        {
1304
0
          for (x = 0; x < dim[3]; x++)
1305
0
            bp1[x] = ccv_max(ap1[x], min);
1306
0
          ap1 += astride[2];
1307
0
          bp1 += bstride[2];
1308
0
        }
1309
0
      }
1310
0
    }
1311
0
  } else {
1312
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1313
0
    {
1314
      // Special casing if astride[2] is the same as dim[3]
1315
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1316
0
      {
1317
0
        float* ap0 = ap + i[0] * astride[0];
1318
0
        float* bp0 = bp + i[0] * bstride[0];
1319
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1320
0
        {
1321
0
          for (x = 0; x < count; x++)
1322
0
            bp0[x] = ccv_clamp(ap0[x], min, max);
1323
0
          ap0 += astride[1];
1324
0
          bp0 += bstride[1];
1325
0
        }
1326
0
      }
1327
0
      return CCV_NNC_EXEC_SUCCESS;
1328
0
    }
1329
    // Non-optimal case, need to do skip copy.
1330
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1331
0
    {
1332
0
      float* const ap0 = ap + i[0] * astride[0];
1333
0
      float* const bp0 = bp + i[0] * bstride[0];
1334
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1335
0
      {
1336
0
        float* ap1 = ap0 + i[1] * astride[1];
1337
0
        float* bp1 = bp0 + i[1] * bstride[1];
1338
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1339
0
        {
1340
0
          for (x = 0; x < dim[3]; x++)
1341
0
            bp1[x] = ccv_clamp(ap1[x], min, max);
1342
0
          ap1 += astride[2];
1343
0
          bp1 += bstride[2];
1344
0
        }
1345
0
      }
1346
0
    }
1347
0
  }
1348
0
  return CCV_NNC_EXEC_SUCCESS;
1349
0
}
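For reference, the forward kernel above amounts to an element-wise clamp with the NaN convention used by cmd.info.clamp: a NaN min means only the upper bound applies, and a NaN max means only the lower bound applies. Below is a minimal standalone sketch of the contiguous fast path; clamp_forw_contig and its parameter names are illustrative only, not part of the library.

#include <math.h>

/* Minimal sketch of the contiguous clamp forward path: b[i] = clamp(a[i], min, max),
 * where a NaN bound means that side is unbounded (mirroring the isnan branches above). */
static void clamp_forw_contig(const float* a, float* b, const int n, const float min, const float max)
{
  int x;
  if (isnan(min)) {
    for (x = 0; x < n; x++)
      b[x] = a[x] < max ? a[x] : max; /* upper bound only */
  } else if (isnan(max)) {
    for (x = 0; x < n; x++)
      b[x] = a[x] > min ? a[x] : min; /* lower bound only */
  } else {
    for (x = 0; x < n; x++)
      b[x] = a[x] < min ? min : (a[x] > max ? max : a[x]); /* both bounds */
  }
}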
1350
1351
static int _ccv_nnc_clamp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1352
3
{
1353
3
  assert(input_size == 3);
1354
3
  const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradient
1355
3
  const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
1356
3
  assert(output_size == 1);
1357
3
  ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
1358
  // Assuming this is float 32.
1359
3
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1360
3
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
1361
3
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1362
3
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
1363
3
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1364
3
  ccv_nnc_tensor_view_get_dim(g, dim);
1365
3
  ccv_nnc_tensor_view_get_dim(h, dim);
1366
3
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1367
3
  int x;
1368
3
  const float min = cmd.info.clamp.min;
1369
3
  const float max = cmd.info.clamp.max;
1370
3
  assert(!isnan(min) || !isnan(max));
1371
3
  if (g)
1372
3
  {
1373
3
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1374
3
    {
1375
      // Super optimal case, just do one for-loop for the gradient.
1376
3
      const int tensor_count = ccv_nnc_tensor_count(g->info);
1377
3
      if (isnan(min))
1378
1
      {
1379
1.00k
        for (x = 0; x < tensor_count; x++)
1380
1.00k
          h->data.f32[x] = b->data.f32[x] >= max ? 0 : g->data.f32[x]; /* branch counts: 509 / 491 */
1381
2
      } else if (isnan(max)) {
1382
1.00k
        for (x = 0; x < tensor_count; x++)
1383
1.00k
          h->data.f32[x] = b->data.f32[x] <= min ? 0 : g->data.f32[x]; /* zero branch hit 0 times */
1384
1
      } else {
1385
1.00k
        for (x = 0; x < tensor_count; x++)
1386
1.00k
          h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : g->data.f32[x]; /* branch counts: 491 / 509 / 491 */
1387
1
      }
1388
3
      return CCV_NNC_EXEC_SUCCESS;
1389
3
    }
1390
0
    int gstride[CCV_NNC_MAX_DIM_ALLOC];
1391
0
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
1392
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1393
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
1394
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
1395
0
    ccv_nnc_tensor_view_get_stride(h, hstride);
1396
0
    int i[CCV_NNC_MAX_DIM + 2];
1397
0
    float* const gp = g->data.f32;
1398
0
    float* const bp = b->data.f32;
1399
0
    float* const hp = h->data.f32;
1400
0
    const int count = dim[2] * dim[3];
1401
0
    const float min = cmd.info.clamp.min;
1402
0
    const float max = cmd.info.clamp.max;
1403
0
    assert(!isnan(min) || !isnan(max));
1404
0
    if (isnan(min))
1405
0
    {
1406
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1407
0
      {
1408
        // Special casing if gstride[2] is the same as dim[3]
1409
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1410
0
        {
1411
0
          float* gp0 = gp + i[0] * gstride[0];
1412
0
          float* bp0 = bp + i[0] * bstride[0];
1413
0
          float* hp0 = hp + i[0] * hstride[0];
1414
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1415
0
          {
1416
0
            for (x = 0; x < count; x++)
1417
0
              hp0[x] = bp0[x] >= max ? 0 : gp0[x];
1418
0
            gp0 += gstride[1];
1419
0
            bp0 += bstride[1];
1420
0
            hp0 += hstride[1];
1421
0
          }
1422
0
        }
1423
0
        return CCV_NNC_EXEC_SUCCESS;
1424
0
      }
1425
      // Non-optimal case, need to do skip copy.
1426
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1427
0
      {
1428
0
        float* const gp0 = gp + i[0] * gstride[0];
1429
0
        float* const bp0 = bp + i[0] * bstride[0];
1430
0
        float* const hp0 = hp + i[0] * hstride[0];
1431
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1432
0
        {
1433
0
          float* gp1 = gp0 + i[1] * gstride[1];
1434
0
          float* bp1 = bp0 + i[1] * bstride[1];
1435
0
          float* hp1 = hp0 + i[1] * hstride[1];
1436
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1437
0
          {
1438
0
            for (x = 0; x < dim[3]; x++)
1439
0
              hp1[x] = bp1[x] >= max ? 0 : gp1[x];
1440
0
            gp1 += gstride[2];
1441
0
            bp1 += bstride[2];
1442
0
            hp1 += hstride[2];
1443
0
          }
1444
0
        }
1445
0
      }
1446
0
    } else if (isnan(max)) {
1447
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1448
0
      {
1449
        // Special casing if gstride[2] is the same as dim[3]
1450
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1451
0
        {
1452
0
          float* gp0 = gp + i[0] * gstride[0];
1453
0
          float* bp0 = bp + i[0] * bstride[0];
1454
0
          float* hp0 = hp + i[0] * hstride[0];
1455
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1456
0
          {
1457
0
            for (x = 0; x < count; x++)
1458
0
              hp0[x] = bp0[x] <= min ? 0 : gp0[x];
1459
0
            gp0 += gstride[1];
1460
0
            bp0 += bstride[1];
1461
0
            hp0 += hstride[1];
1462
0
          }
1463
0
        }
1464
0
        return CCV_NNC_EXEC_SUCCESS;
1465
0
      }
1466
      // Non-optimal case, need to do skip copy.
1467
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1468
0
      {
1469
0
        float* const gp0 = gp + i[0] * gstride[0];
1470
0
        float* const bp0 = bp + i[0] * bstride[0];
1471
0
        float* const hp0 = hp + i[0] * hstride[0];
1472
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1473
0
        {
1474
0
          float* gp1 = gp0 + i[1] * gstride[1];
1475
0
          float* bp1 = bp0 + i[1] * bstride[1];
1476
0
          float* hp1 = hp0 + i[1] * hstride[1];
1477
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1478
0
          {
1479
0
            for (x = 0; x < dim[3]; x++)
1480
0
              hp1[x] = bp1[x] <= min ? 0 : gp1[x];
1481
0
            gp1 += gstride[2];
1482
0
            bp1 += bstride[2];
1483
0
            hp1 += hstride[2];
1484
0
          }
1485
0
        }
1486
0
      }
1487
0
    } else {
1488
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1489
0
      {
1490
        // Special casing if gstride[2] is the same as dim[3]
1491
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1492
0
        {
1493
0
          float* gp0 = gp + i[0] * gstride[0];
1494
0
          float* bp0 = bp + i[0] * bstride[0];
1495
0
          float* hp0 = hp + i[0] * hstride[0];
1496
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1497
0
          {
1498
0
            for (x = 0; x < count; x++)
1499
0
              hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : gp0[x];
1500
0
            gp0 += gstride[1];
1501
0
            bp0 += bstride[1];
1502
0
            hp0 += hstride[1];
1503
0
          }
1504
0
        }
1505
0
        return CCV_NNC_EXEC_SUCCESS;
1506
0
      }
1507
      // Non-optimal case, need to do skip copy.
1508
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1509
0
      {
1510
0
        float* const gp0 = gp + i[0] * gstride[0];
1511
0
        float* const bp0 = bp + i[0] * bstride[0];
1512
0
        float* const hp0 = hp + i[0] * hstride[0];
1513
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1514
0
        {
1515
0
          float* gp1 = gp0 + i[1] * gstride[1];
1516
0
          float* bp1 = bp0 + i[1] * bstride[1];
1517
0
          float* hp1 = hp0 + i[1] * hstride[1];
1518
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1519
0
          {
1520
0
            for (x = 0; x < dim[3]; x++)
1521
0
              hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : gp1[x];
1522
0
            gp1 += gstride[2];
1523
0
            bp1 += bstride[2];
1524
0
            hp1 += hstride[2];
1525
0
          }
1526
0
        }
1527
0
      }
1528
0
    }
1529
0
  } else {
1530
0
    if (!CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1531
0
    {
1532
      // Super optimal case, just do one for-loop for the 0/1 mask.
1533
0
      const int tensor_count = ccv_nnc_tensor_count(h->info);
1534
0
      if (isnan(min))
1535
0
      {
1536
0
        for (x = 0; x < tensor_count; x++)
1537
0
          h->data.f32[x] = b->data.f32[x] >= max ? 0 : 1;
1538
0
      } else if (isnan(max)) {
1539
0
        for (x = 0; x < tensor_count; x++)
1540
0
          h->data.f32[x] = b->data.f32[x] <= min ? 0 : 1;
1541
0
      } else {
1542
0
        for (x = 0; x < tensor_count; x++)
1543
0
          h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : 1;
1544
0
      }
1545
0
      return CCV_NNC_EXEC_SUCCESS;
1546
0
    }
1547
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1548
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
1549
0
    ccv_nnc_tensor_view_get_stride(h, hstride);
1550
0
    int i[CCV_NNC_MAX_DIM + 2];
1551
0
    float* const bp = b->data.f32;
1552
0
    float* const hp = h->data.f32;
1553
0
    const int count = dim[2] * dim[3];
1554
0
    const float min = cmd.info.clamp.min;
1555
0
    const float max = cmd.info.clamp.max;
1556
0
    assert(!isnan(min) || !isnan(max));
1557
0
    if (isnan(min))
1558
0
    {
1559
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1560
0
      {
1561
        // Special casing if bstride[2] is the same as dim[3]
1562
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1563
0
        {
1564
0
          float* bp0 = bp + i[0] * bstride[0];
1565
0
          float* hp0 = hp + i[0] * hstride[0];
1566
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1567
0
          {
1568
0
            for (x = 0; x < count; x++)
1569
0
              hp0[x] = bp0[x] >= max ? 0 : 1;
1570
0
            bp0 += bstride[1];
1571
0
            hp0 += hstride[1];
1572
0
          }
1573
0
        }
1574
0
        return CCV_NNC_EXEC_SUCCESS;
1575
0
      }
1576
      // Non-optimal case, need to do skip copy.
1577
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1578
0
      {
1579
0
        float* const bp0 = bp + i[0] * bstride[0];
1580
0
        float* const hp0 = hp + i[0] * hstride[0];
1581
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1582
0
        {
1583
0
          float* bp1 = bp0 + i[1] * bstride[1];
1584
0
          float* hp1 = hp0 + i[1] * hstride[1];
1585
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1586
0
          {
1587
0
            for (x = 0; x < dim[3]; x++)
1588
0
              hp1[x] = bp1[x] >= max ? 0 : 1;
1589
0
            bp1 += bstride[2];
1590
0
            hp1 += hstride[2];
1591
0
          }
1592
0
        }
1593
0
      }
1594
0
    } else if (isnan(max)) {
1595
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1596
0
      {
1597
        // Special casing if bstride[2] is the same as dim[3]
1598
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1599
0
        {
1600
0
          float* bp0 = bp + i[0] * bstride[0];
1601
0
          float* hp0 = hp + i[0] * hstride[0];
1602
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1603
0
          {
1604
0
            for (x = 0; x < count; x++)
1605
0
              hp0[x] = bp0[x] <= min ? 0 : 1;
1606
0
            bp0 += bstride[1];
1607
0
            hp0 += hstride[1];
1608
0
          }
1609
0
        }
1610
0
        return CCV_NNC_EXEC_SUCCESS;
1611
0
      }
1612
      // Non-optimal case, need to do skip copy.
1613
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1614
0
      {
1615
0
        float* const bp0 = bp + i[0] * bstride[0];
1616
0
        float* const hp0 = hp + i[0] * hstride[0];
1617
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1618
0
        {
1619
0
          float* bp1 = bp0 + i[1] * bstride[1];
1620
0
          float* hp1 = hp0 + i[1] * hstride[1];
1621
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1622
0
          {
1623
0
            for (x = 0; x < dim[3]; x++)
1624
0
              hp1[x] = bp1[x] <= min ? 0 : 1;
1625
0
            bp1 += bstride[2];
1626
0
            hp1 += hstride[2];
1627
0
          }
1628
0
        }
1629
0
      }
1630
0
    } else {
1631
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1632
0
      {
1633
        // Special casing if bstride[2] is the same as dim[3]
1634
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1635
0
        {
1636
0
          float* bp0 = bp + i[0] * bstride[0];
1637
0
          float* hp0 = hp + i[0] * hstride[0];
1638
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1639
0
          {
1640
0
            for (x = 0; x < count; x++)
1641
0
              hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : 1;
1642
0
            bp0 += bstride[1];
1643
0
            hp0 += hstride[1];
1644
0
          }
1645
0
        }
1646
0
        return CCV_NNC_EXEC_SUCCESS;
1647
0
      }
1648
      // Non-optimal case, need to do skip copy.
1649
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1650
0
      {
1651
0
        float* const bp0 = bp + i[0] * bstride[0];
1652
0
        float* const hp0 = hp + i[0] * hstride[0];
1653
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1654
0
        {
1655
0
          float* bp1 = bp0 + i[1] * bstride[1];
1656
0
          float* hp1 = hp0 + i[1] * hstride[1];
1657
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1658
0
          {
1659
0
            for (x = 0; x < dim[3]; x++)
1660
0
              hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : 1;
1661
0
            bp1 += bstride[2];
1662
0
            hp1 += hstride[2];
1663
0
          }
1664
0
        }
1665
0
      }
1666
0
    }
1667
0
  }
1668
0
  return CCV_NNC_EXEC_SUCCESS;
1669
3
}
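The backward kernel above applies the usual clamp gradient rule: the incoming gradient g is passed through wherever the forward output b stayed strictly inside the bounds and zeroed where b was clipped; when g is not supplied, the 0/1 mask itself is written out. A minimal standalone sketch of the with-gradient, contiguous case follows, under the same NaN-bound convention; clamp_back_contig and its parameter names are illustrative, not library API.

#include <math.h>

/* Minimal sketch of the contiguous clamp backward path:
 * h[i] = g[i] where the forward output b[i] was not clipped, and 0 where it was. */
static void clamp_back_contig(const float* g, const float* b, float* h, const int n, const float min, const float max)
{
  int x;
  for (x = 0; x < n; x++)
  {
    const int clipped_hi = !isnan(max) && b[x] >= max;
    const int clipped_lo = !isnan(min) && b[x] <= min;
    h[x] = (clipped_hi || clipped_lo) ? 0 : g[x];
  }
}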
1670
1671
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1672
1
{
1673
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1674
1
  registry->tensor_datatypes = CCV_32F;
1675
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1676
1
  registry->algorithms = 1;
1677
1
  registry->exec = _ccv_nnc_ewsum_forw;
1678
1
}
1679
1680
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1681
1
{
1682
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1683
1
  registry->tensor_datatypes = CCV_32F;
1684
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1685
1
  registry->algorithms = 1;
1686
1
  registry->exec = _ccv_nnc_ewsum_back;
1687
1
}
1688
1689
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1690
1
{
1691
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1692
1
  registry->tensor_datatypes = CCV_32F;
1693
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1694
1
  registry->algorithms = 1;
1695
1
  registry->exec = _ccv_nnc_ewprod_forw;
1696
1
}
1697
1698
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1699
1
{
1700
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1701
1
  registry->tensor_datatypes = CCV_32F;
1702
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1703
1
  registry->algorithms = 1;
1704
1
  registry->exec = _ccv_nnc_ewprod_back;
1705
1
}
1706
1707
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1708
1
{
1709
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1710
1
  registry->tensor_datatypes = CCV_32F;
1711
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1712
1
  registry->algorithms = 1;
1713
1
  registry->exec = _ccv_nnc_ewdiv_forw;
1714
1
}
1715
1716
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1717
1
{
1718
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1719
1
  registry->tensor_datatypes = CCV_32F;
1720
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1721
1
  registry->algorithms = 1;
1722
1
  registry->exec = _ccv_nnc_ewdiv_back;
1723
1
}
1724
1725
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1726
1
{
1727
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1728
1
  registry->tensor_datatypes = CCV_32F;
1729
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1730
1
  registry->algorithms = 1;
1731
1
  registry->exec = _ccv_nnc_ewexp_forw;
1732
1
}
1733
1734
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1735
1
{
1736
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1737
1
  registry->tensor_datatypes = CCV_32F;
1738
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1739
1
  registry->algorithms = 1;
1740
1
  registry->exec = _ccv_nnc_ewexp_back;
1741
1
}
1742
1743
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1744
1
{
1745
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1746
1
  registry->tensor_datatypes = CCV_32F;
1747
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1748
1
  registry->algorithms = 1;
1749
1
  registry->exec = _ccv_nnc_ewlog_forw;
1750
1
}
1751
1752
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1753
1
{
1754
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1755
1
  registry->tensor_datatypes = CCV_32F;
1756
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1757
1
  registry->algorithms = 1;
1758
1
  registry->exec = _ccv_nnc_ewlog_back;
1759
1
}
1760
1761
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1762
1
{
1763
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1764
1
  registry->tensor_datatypes = CCV_32F;
1765
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1766
1
  registry->algorithms = 1;
1767
1
  registry->exec = _ccv_nnc_ewsqrt_forw;
1768
1
}
1769
1770
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1771
1
{
1772
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1773
1
  registry->tensor_datatypes = CCV_32F;
1774
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1775
1
  registry->algorithms = 1;
1776
1
  registry->exec = _ccv_nnc_ewsqrt_back;
1777
1
}
1778
1779
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1780
1
{
1781
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1782
1
  registry->tensor_datatypes = CCV_32F;
1783
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1784
1
  registry->algorithms = 1;
1785
1
  registry->exec = _ccv_nnc_clamp_forw;
1786
1
}
1787
1788
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1789
1
{
1790
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1791
1
  registry->tensor_datatypes = CCV_32F;
1792
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1793
1
  registry->algorithms = 1;
1794
1
  registry->exec = _ccv_nnc_clamp_back;
1795
1
}
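As a usage note, the registrations above mean the CPU reference kernels in this file are what run when these commands execute on CPU float32 tensors. The sketch below shows one way to invoke the clamp forward command through the public API; it assumes the CMD_CLAMP_FORWARD, CPU_TENSOR_NHWC and TENSOR_LIST convenience macros from nnc/ccv_nnc_easy.h and the ccv_nnc_cmd_exec entry point behave as in the library's tests, so treat it as an illustration rather than a verified snippet.

#include <ccv.h>
#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

/* Clamp a 10-element float32 tensor into [0, 6]; on CPU tensors this is expected to
 * dispatch to _ccv_nnc_clamp_forw via the CCV_NNC_BACKEND_CPU_REF registration above. */
static void clamp_example(void)
{
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
  int i;
  for (i = 0; i < 10; i++)
    a->data.f32[i] = (float)(i - 3); /* values from -3 to 6 */
  ccv_nnc_cmd_exec(CMD_CLAMP_FORWARD(0, 6), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  /* b->data.f32 now holds the inputs clamped into [0, 6]. */
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
}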