Coverage Report

Created: 2025-05-31 15:19

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line / Count / Source — each source line below is listed as its line number in the file, its execution count (omitted for lines with no instrumented code), then the source text.
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
void _ccv_nnc_ewsum_forw_cpu_ref_f32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
16
36.0k
{
17
36.0k
  if (input_size == 1 && output_size == 1)
18
0
  {
19
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
20
0
    return;
21
0
  }
22
  // Assuming this is float 32.
23
36.0k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
24
36.0k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
25
36.0k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
26
36.0k
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
27
36.0k
  int x, z;
28
36.0k
  int k = 0;
29
  // Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
30
72.1k
  for (z = 1; z < input_size; z++)
31
36.0k
  {
32
36.0k
    ccv_nnc_tensor_view_t* c = outputs[0];
33
36.0k
    ccv_nnc_tensor_view_t* a = inputs[z];
34
36.0k
    if (c->data.f32 == a->data.f32)
35
10
    {
36
10
      k = z;
37
10
      break;
38
10
    }
39
36.0k
  }
40
72.1k
  for (z = 0; z < input_size - 1; z++)
41
36.0k
  {
42
36.0k
    ccv_nnc_tensor_view_t* c = outputs[0];
43
36.0k
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
44
36.0k
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
45
36.0k
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
46
36.0k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
47
36.0k
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
48
36.0k
    ccv_nnc_tensor_view_get_dim(a, dim);
49
36.0k
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
50
36.0k
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
51
36.0k
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
52
36.0k
    {
53
      // Super optimal case, just do one for-loop for sum.
54
36.0k
      const int tensor_count = ccv_nnc_tensor_count(a->info);
55
15.5M
      for (x = 0; x < tensor_count; x++)
56
15.4M
        c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
57
36.0k
      continue;
58
36.0k
    }
59
36.0k
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
60
3
    ccv_nnc_tensor_view_get_stride(a, astride);
61
3
    ccv_nnc_tensor_view_get_stride(b, bstride);
62
3
    ccv_nnc_tensor_view_get_stride(c, cstride);
63
3
    int i[CCV_NNC_MAX_DIM + 2];
64
3
    float* const ap = a->data.f32;
65
3
    float* const bp = b->data.f32;
66
3
    float* const cp = c->data.f32;
67
3
    const int count = dim[2] * dim[3];
68
3
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
69
3
    {
70
      // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
71
6
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
72
3
      {
73
3
        float* ap0 = ap + i[0] * astride[0];
74
3
        float* bp0 = bp + i[0] * bstride[0];
75
3
        float* cp0 = cp + i[0] * cstride[0];
76
6
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
77
3
        {
78
6
          for (x = 0; x < count; x++)
79
3
            cp0[x] = ap0[x] + bp0[x];
80
3
          ap0 += astride[1];
81
3
          bp0 += bstride[1];
82
3
          cp0 += cstride[1];
83
3
        }
84
3
      }
85
3
      continue;
86
3
    }
87
    // Non-optimal case, need to do skip copy.
88
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
89
0
    {
90
0
      float* const ap0 = ap + i[0] * astride[0];
91
0
      float* const bp0 = bp + i[0] * bstride[0];
92
0
      float* const cp0 = cp + i[0] * cstride[0];
93
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
94
0
      {
95
0
        float* ap1 = ap0 + i[1] * astride[1];
96
0
        float* bp1 = bp0 + i[1] * bstride[1];
97
0
        float* cp1 = cp0 + i[1] * cstride[1];
98
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
99
0
        {
100
0
          for (x = 0; x < dim[3]; x++)
101
0
            cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
102
0
          ap1 += astride[2];
103
0
          bp1 += bstride[2];
104
0
          cp1 += cstride[2];
105
0
        }
106
0
      }
107
0
    }
108
0
  }
109
36.0k
}
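
The two scans above implement the in-place promise noted in the comment: the first loop looks for an input whose buffer aliases the output and records it as k, and the accumulation loop then starts from inputs[k] (and never re-adds it as a right-hand operand) so the aliased values are read before they are overwritten. A minimal sketch of the same ordering on plain float arrays, assuming nothing beyond what is listed above (the real kernel works on ccv_nnc_tensor_view_t and also handles strided views):

    /* Illustrative only: mirrors the aliasing scan and accumulation order of
     * _ccv_nnc_ewsum_forw_cpu_ref_f32 on bare float buffers. */
    #include <stdio.h>

    static void ewsum_sketch(float* const* inputs, int input_size, float* out, int count)
    {
        int k = 0, z, x;
        for (z = 1; z < input_size; z++)
            if (inputs[z] == out) { k = z; break; } /* the output aliases inputs[k] */
        for (z = 0; z < input_size - 1; z++)
        {
            const float* a = z > 0 ? out : inputs[k]; /* first pass reads the aliased input */
            const float* b = z >= k ? inputs[z + 1] : inputs[z]; /* skip inputs[k] on the right */
            for (x = 0; x < count; x++)
                out[x] = a[x] + b[x];
        }
    }

    int main(void)
    {
        float x[2] = { 1, 2 }, y[2] = { 3, 4 };
        float* ins[2] = { x, y };
        ewsum_sketch(ins, 2, y, 2); /* in-place: the output buffer is the second input */
        printf("%g %g\n", y[0], y[1]); /* prints 4 6 */
        return 0;
    }
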
110
111
void _ccv_nnc_ewsum_forw_cpu_ref_i32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
112
0
{
113
0
  if (input_size == 1 && output_size == 1)
114
0
  {
115
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
116
0
    return;
117
0
  }
118
  // Assuming this is float 32.
119
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
120
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
121
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
122
0
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
123
0
  int x, z;
124
0
  int k = 0;
125
  // Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
126
0
  for (z = 1; z < input_size; z++)
127
0
  {
128
0
    ccv_nnc_tensor_view_t* c = outputs[0];
129
0
    ccv_nnc_tensor_view_t* a = inputs[z];
130
0
    if (c->data.f32 == a->data.f32)
131
0
    {
132
0
      k = z;
133
0
      break;
134
0
    }
135
0
  }
136
0
  for (z = 0; z < input_size - 1; z++)
137
0
  {
138
0
    ccv_nnc_tensor_view_t* c = outputs[0];
139
0
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
140
0
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
141
0
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
142
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
143
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
144
0
    ccv_nnc_tensor_view_get_dim(a, dim);
145
0
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
146
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
147
0
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
148
0
    {
149
      // Super optimal case, just do one for-loop for sum.
150
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
151
0
      for (x = 0; x < tensor_count; x++)
152
0
        c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
153
0
      continue;
154
0
    }
155
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
156
0
    ccv_nnc_tensor_view_get_stride(a, astride);
157
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
158
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
159
0
    int i[CCV_NNC_MAX_DIM + 2];
160
0
    int* const ap = a->data.i32;
161
0
    int* const bp = b->data.i32;
162
0
    int* const cp = c->data.i32;
163
0
    const int count = dim[2] * dim[3];
164
0
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
165
0
    {
166
      // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
167
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
168
0
      {
169
0
        int* ap0 = ap + i[0] * astride[0];
170
0
        int* bp0 = bp + i[0] * bstride[0];
171
0
        int* cp0 = cp + i[0] * cstride[0];
172
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
173
0
        {
174
0
          for (x = 0; x < count; x++)
175
0
            cp0[x] = ap0[x] + bp0[x];
176
0
          ap0 += astride[1];
177
0
          bp0 += bstride[1];
178
0
          cp0 += cstride[1];
179
0
        }
180
0
      }
181
0
      continue;
182
0
    }
183
    // Non-optimal case, need to do skip copy.
184
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
185
0
    {
186
0
      int* const ap0 = ap + i[0] * astride[0];
187
0
      int* const bp0 = bp + i[0] * bstride[0];
188
0
      int* const cp0 = cp + i[0] * cstride[0];
189
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
190
0
      {
191
0
        int* ap1 = ap0 + i[1] * astride[1];
192
0
        int* bp1 = bp0 + i[1] * bstride[1];
193
0
        int* cp1 = cp0 + i[1] * cstride[1];
194
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
195
0
        {
196
0
          for (x = 0; x < dim[3]; x++)
197
0
            cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
198
0
          ap1 += astride[2];
199
0
          bp1 += bstride[2];
200
0
          cp1 += cstride[2];
201
0
        }
202
0
      }
203
0
    }
204
0
  }
205
0
}
206
207
static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
208
36.0k
{
209
36.0k
  if (outputs[0]->info.datatype == CCV_32S)
210
0
    _ccv_nnc_ewsum_forw_cpu_ref_i32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
211
36.0k
  else
212
36.0k
    _ccv_nnc_ewsum_forw_cpu_ref_f32((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
213
36.0k
  return CCV_NNC_EXEC_SUCCESS;
214
36.0k
}
215
216
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
217
7.94k
{
218
  // D[x + y + z, x] = 1
219
7.94k
  int i;
220
7.94k
  if (inputs[0] == 0)
221
0
  {
222
    // Set them to 1.
223
0
    for (i = 0; i < output_size; i++)
224
0
      if (outputs[i])
225
0
        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], 1);
226
7.94k
  } else {
227
    // Copy over the gradient (If they are not pointing to the same tensor already).
228
23.8k
    for (i = 0; i < output_size; i++)
229
15.8k
      if (outputs[i] && inputs[0]->data.f32 != outputs[i]->data.f32)
230
7.72k
        _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[i]);
231
7.94k
  }
232
7.94k
  return CCV_NNC_EXEC_SUCCESS;
233
7.94k
}
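
Since every partial derivative of a sum is 1 (the D[x + y + z, x] = 1 comment above), backpropagation through EWSUM just fans the incoming gradient out to each requested output, which is why the gradient branch only copies inputs[0] and skips outputs that already alias it. As a one-line worked case:

    s = x + y + z  =>  D[s, x] = D[s, y] = D[s, z] = 1, so each output gradient is a copy of g = inputs[0].
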
234
235
void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
236
30.6k
{
237
30.6k
  if (input_size == 1 && output_size == 1)
238
0
  {
239
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
240
0
    return;
241
0
  }
242
  // Assuming this is float 32.
243
30.6k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
244
30.6k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
245
30.6k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
246
30.6k
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
247
30.6k
  int x, z;
248
30.6k
  int k = 0;
249
  // Bad, I promised this can be inplace operation. Need to first find out if there are share the same pointer first.
250
61.3k
  for (z = 1; z < input_size; z++)
251
30.6k
  {
252
30.6k
    ccv_nnc_tensor_view_t* c = outputs[0];
253
30.6k
    ccv_nnc_tensor_view_t* a = inputs[z];
254
30.6k
    if (c->data.f32 == a->data.f32)
255
12
    {
256
12
      k = z;
257
12
      break;
258
12
    }
259
30.6k
  }
260
61.3k
  for (z = 0; z < input_size - 1; z++)
261
30.6k
  {
262
30.6k
    ccv_nnc_tensor_view_t* c = outputs[0];
263
30.6k
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
264
30.6k
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
265
30.6k
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
266
30.6k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
267
30.6k
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
268
30.6k
    ccv_nnc_tensor_view_get_dim(a, dim);
269
30.6k
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
270
30.6k
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
271
30.6k
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
272
30.6k
    {
273
      // Super optimal case, just do one for-loop for sum.
274
30.6k
      const int tensor_count = ccv_nnc_tensor_count(a->info);
275
99.3k
      for (x = 0; x < tensor_count; x++)
276
68.6k
        c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
277
30.6k
      continue;
278
30.6k
    }
279
30.6k
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
280
3
    ccv_nnc_tensor_view_get_stride(a, astride);
281
3
    ccv_nnc_tensor_view_get_stride(b, bstride);
282
3
    ccv_nnc_tensor_view_get_stride(c, cstride);
283
3
    int i[CCV_NNC_MAX_DIM + 2];
284
3
    float* const ap = a->data.f32;
285
3
    float* const bp = b->data.f32;
286
3
    float* const cp = c->data.f32;
287
3
    const int count = dim[2] * dim[3];
288
3
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
289
3
    {
290
      // Special casing if the ainc[3] is the same as dim[3]
291
6
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
292
3
      {
293
3
        float* ap0 = ap + i[0] * astride[0];
294
3
        float* bp0 = bp + i[0] * bstride[0];
295
3
        float* cp0 = cp + i[0] * cstride[0];
296
6
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
297
3
        {
298
6
          for (x = 0; x < count; x++)
299
3
            cp0[x] = ap0[x] * bp0[x];
300
3
          ap0 += astride[1];
301
3
          bp0 += bstride[1];
302
3
          cp0 += cstride[1];
303
3
        }
304
3
      }
305
3
      continue;
306
3
    }
307
    // Non-optimal case, need to do skip copy.
308
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
309
0
    {
310
0
      float* const ap0 = ap + i[0] * astride[0];
311
0
      float* const bp0 = bp + i[0] * bstride[0];
312
0
      float* const cp0 = cp + i[0] * cstride[0];
313
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
314
0
      {
315
0
        float* ap1 = ap0 + i[1] * astride[1];
316
0
        float* bp1 = bp0 + i[1] * bstride[1];
317
0
        float* cp1 = cp0 + i[1] * cstride[1];
318
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
319
0
        {
320
0
          for (x = 0; x < dim[3]; x++)
321
0
            cp1[x] = ap1[x] * bp1[x];
322
0
          ap1 += astride[2];
323
0
          bp1 += bstride[2];
324
0
          cp1 += cstride[2];
325
0
        }
326
0
      }
327
0
    }
328
0
  }
329
30.6k
}
330
331
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
332
20.2k
{
333
20.2k
  _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t**)inputs, input_size, (ccv_nnc_tensor_view_t**)outputs, output_size);
334
20.2k
  return CCV_NNC_EXEC_SUCCESS;
335
20.2k
}
336
337
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
338
20.0k
{
339
  // D[x * y * z, x] = y * z
340
  // Assuming this is float 32.
341
20.0k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
342
20.0k
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
343
20.0k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
344
20.0k
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
345
20.0k
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
346
20.0k
  int x, z;
347
20.0k
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
348
20.0k
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
349
20.0k
  if (g == 0)
350
0
  {
351
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
352
0
    ccv_nnc_tensor_view_get_dim(b, dim);
353
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
354
0
    for (z = 0; z < output_size; z++)
355
0
    {
356
0
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
357
0
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
358
0
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
359
0
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
360
0
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
361
0
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
362
0
      ccv_nnc_tensor_view_get_stride(a, astride);
363
0
      ccv_nnc_tensor_view_get_stride(h, hstride);
364
0
      if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
365
0
      {
366
        // Super optimal case, just do one for-loop for sum.
367
0
        const int tensor_count = ccv_nnc_tensor_count(b->info);
368
0
        for (x = 0; x < tensor_count; x++)
369
0
          h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
370
0
        continue;
371
0
      }
372
0
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
373
0
      int i[CCV_NNC_MAX_DIM + 2];
374
0
      float* const ap = a->data.f32;
375
0
      float* const bp = b->data.f32;
376
0
      float* const hp = h->data.f32;
377
0
      const int count = dim[2] * dim[3];
378
0
      if (astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
379
0
      {
380
        // Special casing if the ainc[3] is the same as dim[3]
381
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
382
0
        {
383
0
          float* ap0 = ap + i[0] * astride[0];
384
0
          float* bp0 = bp + i[0] * bstride[0];
385
0
          float* hp0 = hp + i[0] * hstride[0];
386
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
387
0
          {
388
0
            for (x = 0; x < count; x++)
389
0
              hp0[x] = bp0[x] / ap0[x];
390
0
            ap0 += astride[1];
391
0
            bp0 += bstride[1];
392
0
            hp0 += hstride[1];
393
0
          }
394
0
        }
395
0
        continue;
396
0
      }
397
      // Non-optimal case, need to do skip copy.
398
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
399
0
      {
400
0
        float* const ap0 = ap + i[0] * astride[0];
401
0
        float* const bp0 = bp + i[0] * bstride[0];
402
0
        float* const hp0 = hp + i[0] * hstride[0];
403
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
404
0
        {
405
0
          float* ap1 = ap0 + i[1] * astride[1];
406
0
          float* bp1 = bp0 + i[1] * bstride[1];
407
0
          float* hp1 = hp0 + i[1] * hstride[1];
408
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
409
0
          {
410
0
            for (x = 0; x < dim[3]; x++)
411
0
              hp1[x] = bp1[x] / ap1[x];
412
0
            ap1 += astride[2];
413
0
            bp1 += bstride[2];
414
0
            hp1 += hstride[2];
415
0
          }
416
0
        }
417
0
      }
418
0
    }
419
20.0k
  } else {
420
20.0k
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
421
20.0k
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
422
20.0k
    ccv_nnc_tensor_view_get_dim(b, dim);
423
20.0k
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
424
20.0k
    ccv_nnc_tensor_view_get_stride(b, bstride);
425
20.0k
    ccv_nnc_tensor_view_get_stride(g, gstride);
426
60.2k
    for (z = 0; z < output_size; z++)
427
40.1k
    {
428
40.1k
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
429
40.1k
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
430
40.1k
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
431
40.1k
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
432
40.1k
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
433
40.1k
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
434
40.1k
      ccv_nnc_tensor_view_get_stride(a, astride);
435
40.1k
      ccv_nnc_tensor_view_get_stride(h, hstride);
436
40.1k
      if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
437
40.1k
      {
438
        // Super optimal case, just do one for-loop for sum.
439
40.1k
        const int tensor_count = ccv_nnc_tensor_count(g->info);
440
129k
        for (x = 0; x < tensor_count; x++)
441
89.3k
          h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
442
40.1k
        continue;
443
40.1k
      }
444
40.1k
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
445
3
      int i[CCV_NNC_MAX_DIM + 2];
446
3
      float* const gp = g->data.f32;
447
3
      float* const ap = a->data.f32;
448
3
      float* const bp = b->data.f32;
449
3
      float* const hp = h->data.f32;
450
3
      const int count = dim[2] * dim[3];
451
3
      if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
452
3
      {
453
        // Special casing if the ainc[3] is the same as dim[3]
454
6
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
455
3
        {
456
3
          float* gp0 = gp + i[0] * gstride[0];
457
3
          float* ap0 = ap + i[0] * astride[0];
458
3
          float* bp0 = bp + i[0] * bstride[0];
459
3
          float* hp0 = hp + i[0] * hstride[0];
460
6
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
461
3
          {
462
6
            for (x = 0; x < count; x++)
463
3
              hp0[x] = gp0[x] * bp0[x] / ap0[x];
464
3
            gp0 += gstride[1];
465
3
            ap0 += astride[1];
466
3
            bp0 += bstride[1];
467
3
            hp0 += hstride[1];
468
3
          }
469
3
        }
470
3
        continue;
471
3
      }
472
      // Non-optimal case, need to do skip copy.
473
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
474
0
      {
475
0
        float* const gp0 = gp + i[0] * gstride[0];
476
0
        float* const ap0 = ap + i[0] * astride[0];
477
0
        float* const bp0 = bp + i[0] * bstride[0];
478
0
        float* const hp0 = hp + i[0] * hstride[0];
479
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
480
0
        {
481
0
          float* gp1 = gp0 + i[1] * gstride[1];
482
0
          float* ap1 = ap0 + i[1] * astride[1];
483
0
          float* bp1 = bp0 + i[1] * bstride[1];
484
0
          float* hp1 = hp0 + i[1] * hstride[1];
485
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
486
0
          {
487
0
            for (x = 0; x < dim[3]; x++)
488
0
              hp1[x] = gp1[x] * bp1[x] / ap1[x];
489
0
            gp1 += gstride[2];
490
0
            ap1 += astride[2];
491
0
            bp1 += bstride[2];
492
0
            hp1 += hstride[2];
493
0
          }
494
0
        }
495
0
      }
496
0
    }
497
20.0k
  }
498
20.0k
  return CCV_NNC_EXEC_SUCCESS;
499
20.0k
}
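
The covered fast path above computes h = g * b / a, where b is the saved forward product (inputs[output_size + 1]) and a is the input whose gradient is being produced; dividing the stored product by a recovers the product of all the other inputs without re-multiplying them. A short worked case (note the formula divides by a, so it presumes no zero elements in the inputs):

    c = x * y * z with x = 2, y = 3, z = 4  =>  c = 24
    D[c, x] = y * z = 12 = c / x, so the gradient routed to x is g * c / x.
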
500
501
static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
502
258
{
503
  // Assuming this is float 32.
504
258
  int dim[CCV_NNC_MAX_DIM_ALLOC];
505
258
  int astride[CCV_NNC_MAX_DIM_ALLOC];
506
258
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
507
258
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
508
258
  if (a == 0) // Take 0 as all ones tensor.
509
19
  {
510
19
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
511
19
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
512
19
    ccv_nnc_tensor_view_get_dim(b, dim);
513
19
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
514
19
    int x;
515
19
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
516
19
    {
517
      // Super optimal case, just do one for-loop for sum.
518
19
      const int tensor_count = ccv_nnc_tensor_count(b->info);
519
1.98k
      for (x = 0; x < tensor_count; x++)
520
1.96k
        c->data.f32[x] = p / b->data.f32[x];
521
19
      return;
522
19
    }
523
19
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
524
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
525
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
526
0
    int i[CCV_NNC_MAX_DIM + 2];
527
0
    float* const bp = b->data.f32;
528
0
    float* const cp = c->data.f32;
529
0
    const int count = dim[2] * dim[3];
530
0
    if (bstride[2] == dim[3] && cstride[2] == dim[3])
531
0
    {
532
      // Special casing if the ainc[3] is the same as dim[3]
533
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
534
0
      {
535
0
        float* bp0 = bp + i[0] * bstride[0];
536
0
        float* cp0 = cp + i[0] * cstride[0];
537
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
538
0
        {
539
0
          for (x = 0; x < count; x++)
540
0
            cp0[x] = p / bp0[x];
541
0
          bp0 += bstride[1];
542
0
          cp0 += cstride[1];
543
0
        }
544
0
      }
545
0
      return;
546
0
    }
547
    // Non-optimal case, need to do skip copy.
548
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
549
0
    {
550
0
      float* const bp0 = bp + i[0] * bstride[0];
551
0
      float* const cp0 = cp + i[0] * cstride[0];
552
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
553
0
      {
554
0
        float* bp1 = bp0 + i[1] * bstride[1];
555
0
        float* cp1 = cp0 + i[1] * cstride[1];
556
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
557
0
        {
558
0
          for (x = 0; x < dim[3]; x++)
559
0
            cp1[x] = p / bp1[x];
560
0
          bp1 += bstride[2];
561
0
          cp1 += cstride[2];
562
0
        }
563
0
      }
564
0
    }
565
239
  } else {
566
239
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
567
239
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
568
239
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
569
239
    ccv_nnc_tensor_view_get_dim(a, dim);
570
239
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
571
239
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
572
239
    int x;
573
239
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
574
239
    {
575
      // Super optimal case, just do one for-loop for sum.
576
239
      const int tensor_count = ccv_nnc_tensor_count(a->info);
577
7.04k
      for (x = 0; x < tensor_count; x++)
578
6.80k
        c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
579
239
      return;
580
239
    }
581
239
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
582
0
    ccv_nnc_tensor_view_get_stride(a, astride);
583
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
584
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
585
0
    int i[CCV_NNC_MAX_DIM + 2];
586
0
    float* const ap = a->data.f32;
587
0
    float* const bp = b->data.f32;
588
0
    float* const cp = c->data.f32;
589
0
    const int count = dim[2] * dim[3];
590
0
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3])
591
0
    {
592
      // Special casing if the ainc[3] is the same as dim[3]
593
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
594
0
      {
595
0
        float* ap0 = ap + i[0] * astride[0];
596
0
        float* bp0 = bp + i[0] * bstride[0];
597
0
        float* cp0 = cp + i[0] * cstride[0];
598
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
599
0
        {
600
0
          for (x = 0; x < count; x++)
601
0
            cp0[x] = p * ap0[x] / bp0[x];
602
0
          ap0 += astride[1];
603
0
          bp0 += bstride[1];
604
0
          cp0 += cstride[1];
605
0
        }
606
0
      }
607
0
      return;
608
0
    }
609
    // Non-optimal case, need to do skip copy.
610
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
611
0
    {
612
0
      float* const ap0 = ap + i[0] * astride[0];
613
0
      float* const bp0 = bp + i[0] * bstride[0];
614
0
      float* const cp0 = cp + i[0] * cstride[0];
615
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
616
0
      {
617
0
        float* ap1 = ap0 + i[1] * astride[1];
618
0
        float* bp1 = bp0 + i[1] * bstride[1];
619
0
        float* cp1 = cp0 + i[1] * cstride[1];
620
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
621
0
        {
622
0
          for (x = 0; x < dim[3]; x++)
623
0
            cp1[x] = p * ap1[x] / bp1[x];
624
0
          ap1 += astride[2];
625
0
          bp1 += bstride[2];
626
0
          cp1 += cstride[2];
627
0
        }
628
0
      }
629
0
    }
630
0
  }
631
258
}
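
This helper evaluates c = p * a / b elementwise and treats a == 0 (a null tensor) as an all-ones numerator, so that c = p / b. The scale p is what lets the same routine back several of the simpler gradients below: _ccv_nnc_ewlog_back calls it with p = 1 and a = g, and _ccv_nnc_ewsqrt_back with p = 0.5 and the saved forward output as b.

    c[i] = p * a[i] / b[i]   when a is given
    c[i] = p / b[i]          when a == 0 (taken as all ones)
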
632
633
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
634
30
{
635
30
  _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
636
30
  return CCV_NNC_EXEC_SUCCESS;
637
30
}
638
639
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
640
16
{
641
  // D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
642
16
  if (output_size == 1 || outputs[1] == 0)
643
2
  {
644
    // When we only need D[x / y, x]
645
2
    _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
646
2
    return CCV_NNC_EXEC_SUCCESS;
647
2
  }
648
14
  int dim[CCV_NNC_MAX_DIM_ALLOC];
649
14
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
650
14
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
651
14
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
652
14
  int hastride[CCV_NNC_MAX_DIM_ALLOC];
653
14
  int hbstride[CCV_NNC_MAX_DIM_ALLOC];
654
14
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
655
14
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
656
14
  ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
657
14
  ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
658
14
  ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
659
14
  if (g == 0)
660
0
  {
661
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
662
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
663
0
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
664
0
    ccv_nnc_tensor_view_get_dim(b, dim);
665
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
666
0
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
667
0
    if (ha)
668
0
    {
669
0
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
670
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
671
0
    }
672
0
    int x;
673
0
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
674
0
    {
675
      // Super optimal case, just do one for-loop for sum.
676
0
      const int tensor_count = ccv_nnc_tensor_count(b->info);
677
0
      if (ha == 0)
678
0
      {
679
0
        for (x = 0; x < tensor_count; x++)
680
0
        {
681
0
          const float v = 1 / b->data.f32[x];
682
0
          hb->data.f32[x] = -c->data.f32[x] * v;
683
0
        }
684
0
      } else {
685
0
        for (x = 0; x < tensor_count; x++)
686
0
        {
687
0
          const float v = 1 / b->data.f32[x];
688
0
          ha->data.f32[x] = v;
689
0
          hb->data.f32[x] = -c->data.f32[x] * v;
690
0
        }
691
0
      }
692
0
      return CCV_NNC_EXEC_SUCCESS;
693
0
    }
694
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
695
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
696
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
697
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
698
0
    int i[CCV_NNC_MAX_DIM + 2];
699
0
    float* const bp = b->data.f32;
700
0
    float* const cp = c->data.f32;
701
0
    float* const hbp = hb->data.f32;
702
0
    const int count = dim[2] * dim[3];
703
0
    if (ha == 0)
704
0
    {
705
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
706
0
      {
707
        // Special casing if the ainc[3] is the same as dim[3]
708
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
709
0
        {
710
0
          float* bp0 = bp + i[0] * bstride[0];
711
0
          float* cp0 = cp + i[0] * cstride[0];
712
0
          float* hbp0 = hbp + i[0] * hbstride[0];
713
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
714
0
          {
715
0
            for (x = 0; x < count; x++)
716
0
            {
717
0
              const float v = 1 / bp0[x];
718
0
              hbp0[x] = -cp0[x] * v;
719
0
            }
720
0
            bp0 += bstride[1];
721
0
            cp0 += cstride[1];
722
0
            hbp0 += hbstride[1];
723
0
          }
724
0
        }
725
0
        return CCV_NNC_EXEC_SUCCESS;
726
0
      }
727
      // Non-optimal case, need to do skip copy.
728
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
729
0
      {
730
0
        float* const bp0 = bp + i[0] * bstride[0];
731
0
        float* const cp0 = cp + i[0] * cstride[0];
732
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
733
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
734
0
        {
735
0
          float* bp1 = bp0 + i[1] * bstride[1];
736
0
          float* cp1 = cp0 + i[1] * cstride[1];
737
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
738
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
739
0
          {
740
0
            for (x = 0; x < dim[3]; x++)
741
0
            {
742
0
              const float v = 1 / bp1[x];
743
0
              hbp1[x] = -cp1[x] * v;
744
0
            }
745
0
            bp1 += bstride[2];
746
0
            cp1 += cstride[2];
747
0
            hbp1 += hbstride[2];
748
0
          }
749
0
        }
750
0
      }
751
0
    } else {
752
0
      float* const hap = ha->data.f32;
753
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
754
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
755
0
      {
756
        // Special casing if the ainc[3] is the same as dim[3]
757
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
758
0
        {
759
0
          float* bp0 = bp + i[0] * bstride[0];
760
0
          float* cp0 = cp + i[0] * cstride[0];
761
0
          float* hap0 = hap + i[0] * hastride[0];
762
0
          float* hbp0 = hbp + i[0] * hbstride[0];
763
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
764
0
          {
765
0
            for (x = 0; x < count; x++)
766
0
            {
767
0
              const float v = 1 / bp0[x];
768
0
              hap0[x] = v;
769
0
              hbp0[x] = -cp0[x] * v;
770
0
            }
771
0
            bp0 += bstride[1];
772
0
            cp0 += cstride[1];
773
0
            hap0 += hastride[1];
774
0
            hbp0 += hbstride[1];
775
0
          }
776
0
        }
777
0
        return CCV_NNC_EXEC_SUCCESS;
778
0
      }
779
      // Non-optimal case, need to do skip copy.
780
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
781
0
      {
782
0
        float* const bp0 = bp + i[0] * bstride[0];
783
0
        float* const cp0 = cp + i[0] * cstride[0];
784
0
        float* const hap0 = hap + i[0] * hastride[0];
785
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
786
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
787
0
        {
788
0
          float* bp1 = bp0 + i[1] * bstride[1];
789
0
          float* cp1 = cp0 + i[1] * cstride[1];
790
0
          float* hap1 = hap0 + i[1] * hastride[1];
791
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
792
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
793
0
          {
794
0
            for (x = 0; x < dim[3]; x++)
795
0
            {
796
0
              const float v = 1 / bp1[x];
797
0
              hap1[x] = v;
798
0
              hbp1[x] = -cp1[x] * v;
799
0
            }
800
0
            bp1 += bstride[2];
801
0
            cp1 += cstride[2];
802
0
            hap1 += hastride[2];
803
0
            hbp1 += hbstride[2];
804
0
          }
805
0
        }
806
0
      }
807
0
    }
808
14
  } else {
809
14
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
810
14
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
811
14
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
812
14
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
813
14
    ccv_nnc_tensor_view_get_dim(b, dim);
814
14
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
815
14
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
816
14
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
817
14
    if (ha)
818
1
    {
819
1
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
820
1
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
821
1
    }
822
14
    int x;
823
14
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
824
14
    {
825
      // Super optimal case, just do one for-loop for sum.
826
14
      const int tensor_count = ccv_nnc_tensor_count(g->info);
827
14
      if (ha == 0)
828
13
      {
829
1.50k
        for (x = 0; x < tensor_count; x++)
830
1.48k
        {
831
1.48k
          const float v = g->data.f32[x] / b->data.f32[x];
832
1.48k
          hb->data.f32[x] = -c->data.f32[x] * v;
833
1.48k
        }
834
13
      } else {
835
2
        for (x = 0; x < tensor_count; x++)
836
1
        {
837
1
          const float v = g->data.f32[x] / b->data.f32[x];
838
1
          ha->data.f32[x] = v;
839
1
          hb->data.f32[x] = -c->data.f32[x] * v;
840
1
        }
841
1
      }
842
14
      return CCV_NNC_EXEC_SUCCESS;
843
14
    }
844
14
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
845
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
846
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
847
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
848
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
849
0
    int i[CCV_NNC_MAX_DIM + 2];
850
0
    float* const gp = g->data.f32;
851
0
    float* const bp = b->data.f32;
852
0
    float* const cp = c->data.f32;
853
0
    float* const hbp = hb->data.f32;
854
0
    const int count = dim[2] * dim[3];
855
0
    if (ha == 0)
856
0
    {
857
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
858
0
      {
859
        // Special casing if the ainc[3] is the same as dim[3]
860
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
861
0
        {
862
0
          float* gp0 = gp + i[0] * gstride[0];
863
0
          float* bp0 = bp + i[0] * bstride[0];
864
0
          float* cp0 = cp + i[0] * cstride[0];
865
0
          float* hbp0 = hbp + i[0] * hbstride[0];
866
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
867
0
          {
868
0
            for (x = 0; x < count; x++)
869
0
            {
870
0
              const float v = gp0[x] / bp0[x];
871
0
              hbp0[x] = -cp0[x] * v;
872
0
            }
873
0
            gp0 += gstride[1];
874
0
            bp0 += bstride[1];
875
0
            cp0 += cstride[1];
876
0
            hbp0 += hbstride[1];
877
0
          }
878
0
        }
879
0
        return CCV_NNC_EXEC_SUCCESS;
880
0
      }
881
      // Non-optimal case, need to do skip copy.
882
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
883
0
      {
884
0
        float* const gp0 = gp + i[0] * gstride[0];
885
0
        float* const bp0 = bp + i[0] * bstride[0];
886
0
        float* const cp0 = cp + i[0] * cstride[0];
887
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
888
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
889
0
        {
890
0
          float* gp1 = gp0 + i[1] * gstride[1];
891
0
          float* bp1 = bp0 + i[1] * bstride[1];
892
0
          float* cp1 = cp0 + i[1] * cstride[1];
893
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
894
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
895
0
          {
896
0
            for (x = 0; x < dim[3]; x++)
897
0
            {
898
0
              const float v = gp1[x] / bp1[x];
899
0
              hbp1[x] = -cp1[x] * v;
900
0
            }
901
0
            gp1 += gstride[2];
902
0
            bp1 += bstride[2];
903
0
            cp1 += cstride[2];
904
0
            hbp1 += hbstride[2];
905
0
          }
906
0
        }
907
0
      }
908
0
    } else {
909
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
910
0
      float* const hap = ha->data.f32;
911
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
912
0
      {
913
        // Special casing if the ainc[3] is the same as dim[3]
914
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
915
0
        {
916
0
          float* gp0 = gp + i[0] * gstride[0];
917
0
          float* bp0 = bp + i[0] * bstride[0];
918
0
          float* cp0 = cp + i[0] * cstride[0];
919
0
          float* hap0 = hap + i[0] * hastride[0];
920
0
          float* hbp0 = hbp + i[0] * hbstride[0];
921
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
922
0
          {
923
0
            for (x = 0; x < count; x++)
924
0
            {
925
0
              const float v = gp0[x] / bp0[x];
926
0
              hap0[x] = v;
927
0
              hbp0[x] = -cp0[x] * v;
928
0
            }
929
0
            gp0 += gstride[1];
930
0
            bp0 += bstride[1];
931
0
            cp0 += cstride[1];
932
0
            hap0 += hastride[1];
933
0
            hbp0 += hbstride[1];
934
0
          }
935
0
        }
936
0
        return CCV_NNC_EXEC_SUCCESS;
937
0
      }
938
      // Non-optimal case, need to do skip copy.
939
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
940
0
      {
941
0
        float* const gp0 = gp + i[0] * gstride[0];
942
0
        float* const bp0 = bp + i[0] * bstride[0];
943
0
        float* const cp0 = cp + i[0] * cstride[0];
944
0
        float* const hap0 = hap + i[0] * hastride[0];
945
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
946
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
947
0
        {
948
0
          float* gp1 = gp0 + i[1] * gstride[1];
949
0
          float* bp1 = bp0 + i[1] * bstride[1];
950
0
          float* cp1 = cp0 + i[1] * cstride[1];
951
0
          float* hap1 = hap0 + i[1] * hastride[1];
952
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
953
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
954
0
          {
955
0
            for (x = 0; x < dim[3]; x++)
956
0
            {
957
0
              const float v = gp1[x] / bp1[x];
958
0
              hap1[x] = v;
959
0
              hbp1[x] = -cp1[x] * v;
960
0
            }
961
0
            gp1 += gstride[2];
962
0
            bp1 += bstride[2];
963
0
            cp1 += cstride[2];
964
0
            hap1 += hastride[2];
965
0
            hbp1 += hbstride[2];
966
0
          }
967
0
        }
968
0
      }
969
0
    }
970
0
  }
971
0
  return CCV_NNC_EXEC_SUCCESS;
972
14
}
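
The gradient path exercised above applies the quotient rule using the saved forward output c = a / b (inputs[3]): with v = g / b it writes ha = v and hb = -c * v, i.e. D[a / b, a] = 1 / b and D[a / b, b] = -a / b^2, reusing c instead of dividing twice. A standalone numeric check of those formulas against central differences (a sketch, not library code):

    #include <stdio.h>

    int main(void)
    {
        const float a = 3.f, b = 2.f, g = 1.5f, eps = 1e-3f;
        const float c = a / b;            /* forward output the op saves */
        const float v = g / b;
        const float ha = v;               /* g * D[a / b, a] = g / b */
        const float hb = -c * v;          /* g * D[a / b, b] = -g * a / b^2 */
        const float fd_a = g * ((a + eps) / b - (a - eps) / b) / (2 * eps);
        const float fd_b = g * (a / (b + eps) - a / (b - eps)) / (2 * eps);
        printf("ha = %g (finite difference %g)\n", ha, fd_a);
        printf("hb = %g (finite difference %g)\n", hb, fd_b);
        return 0;
    }
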
973
974
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
975
21
{
976
  // Assuming this is float 32.
977
21
  int dim[CCV_NNC_MAX_DIM_ALLOC];
978
21
  int astride[CCV_NNC_MAX_DIM_ALLOC];
979
21
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
980
21
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
981
21
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
982
21
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
983
21
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
984
21
  ccv_nnc_tensor_view_get_dim(a, dim);
985
21
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
986
21
  int x;
987
21
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
988
20
  {
989
    // Super optimal case, just do one for-loop for sum.
990
20
    const int tensor_count = ccv_nnc_tensor_count(a->info);
991
3.07k
    for (x = 0; x < tensor_count; x++)
992
3.05k
      b->data.f32[x] = exp(a->data.f32[x]);
993
20
    return CCV_NNC_EXEC_SUCCESS;
994
20
  }
995
21
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
996
1
  ccv_nnc_tensor_view_get_stride(a, astride);
997
1
  ccv_nnc_tensor_view_get_stride(b, bstride);
998
1
  int i[CCV_NNC_MAX_DIM + 2];
999
1
  float* const ap = a->data.f32;
1000
1
  float* const bp = b->data.f32;
1001
1
  const int count = dim[2] * dim[3];
1002
1
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1003
1
  {
1004
    // Special casing if the ainc[3] is the same as dim[3]
1005
2
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1006
1
    {
1007
1
      float* ap0 = ap + i[0] * astride[0];
1008
1
      float* bp0 = bp + i[0] * bstride[0];
1009
2
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1010
1
      {
1011
2
        for (x = 0; x < count; x++)
1012
1
          bp0[x] = exp(ap0[x]);
1013
1
        ap0 += astride[1];
1014
1
        bp0 += bstride[1];
1015
1
      }
1016
1
    }
1017
1
    return CCV_NNC_EXEC_SUCCESS;
1018
1
  }
1019
  // Non-optimal case, need to do skip copy.
1020
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1021
0
  {
1022
0
    float* const ap0 = ap + i[0] * astride[0];
1023
0
    float* const bp0 = bp + i[0] * bstride[0];
1024
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1025
0
    {
1026
0
      float* ap1 = ap0 + i[1] * astride[1];
1027
0
      float* bp1 = bp0 + i[1] * bstride[1];
1028
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1029
0
      {
1030
0
        for (x = 0; x < dim[3]; x++)
1031
0
          bp1[x] = exp(ap1[x]);
1032
0
        ap1 += astride[2];
1033
0
        bp1 += bstride[2];
1034
0
      }
1035
0
    }
1036
0
  }
1037
0
  return CCV_NNC_EXEC_SUCCESS;
1038
1
}
1039
1040
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1041
9
{
1042
  // D[Exp[x], x] = Exp[x]
1043
9
  if (inputs[0] == 0)
1044
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1045
9
  else
1046
9
    _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
1047
9
      (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2]
1048
9
    }, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
1049
9
  return CCV_NNC_EXEC_SUCCESS;
1050
9
}
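
Because D[Exp[x], x] = Exp[x], the gradient is just the upstream gradient times the saved forward output exp(x) (inputs[2]); the kernel therefore delegates to _ccv_nnc_ewprod_forw_cpu_ref on the pair {g, exp(x)} rather than carrying its own loop:

    y = Exp[x]  =>  gradient = g * Exp[x] = g * y
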
1051
1052
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1053
258
{
1054
  // Assuming this is float 32.
1055
258
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1056
258
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1057
258
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1058
258
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1059
258
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1060
258
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1061
258
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1062
258
  ccv_nnc_tensor_view_get_dim(a, dim);
1063
258
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1064
258
  int x;
1065
258
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1066
258
  {
1067
    // Super optimal case, just do one for-loop for sum.
1068
258
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1069
3.55k
    for (x = 0; x < tensor_count; x++)
1070
3.29k
      b->data.f32[x] = log(a->data.f32[x]);
1071
258
    return CCV_NNC_EXEC_SUCCESS;
1072
258
  }
1073
258
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1074
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1075
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1076
0
  int i[CCV_NNC_MAX_DIM + 2];
1077
0
  float* const ap = a->data.f32;
1078
0
  float* const bp = b->data.f32;
1079
0
  const int count = dim[2] * dim[3];
1080
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1081
0
  {
1082
    // Special casing if the ainc[3] is the same as dim[3]
1083
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1084
0
    {
1085
0
      float* ap0 = ap + i[0] * astride[0];
1086
0
      float* bp0 = bp + i[0] * bstride[0];
1087
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1088
0
      {
1089
0
        for (x = 0; x < count; x++)
1090
0
          bp0[x] = log(ap0[x]);
1091
0
        ap0 += astride[1];
1092
0
        bp0 += bstride[1];
1093
0
      }
1094
0
    }
1095
0
    return CCV_NNC_EXEC_SUCCESS;
1096
0
  }
1097
  // Non-optimal case, need to do skip copy.
1098
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1099
0
  {
1100
0
    float* const ap0 = ap + i[0] * astride[0];
1101
0
    float* const bp0 = bp + i[0] * bstride[0];
1102
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1103
0
    {
1104
0
      float* ap1 = ap0 + i[1] * astride[1];
1105
0
      float* bp1 = bp0 + i[1] * bstride[1];
1106
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1107
0
      {
1108
0
        for (x = 0; x < dim[3]; x++)
1109
0
          bp1[x] = log(ap1[x]);
1110
0
        ap1 += astride[2];
1111
0
        bp1 += bstride[2];
1112
0
      }
1113
0
    }
1114
0
  }
1115
0
  return CCV_NNC_EXEC_SUCCESS;
1116
0
}
1117
1118
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1119
224
{
1120
  // D[Log[x], x] = 1 / x
1121
224
  _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
1122
224
  return CCV_NNC_EXEC_SUCCESS;
1123
224
}
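
Here D[Log[x], x] = 1 / x, so the gradient is g / x; the call expresses that through the shared division helper with p = 1, a = g and b = x (inputs[1], the forward input):

    y = Log[x]  =>  gradient = g / x
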
1124
1125
static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1126
5
{
1127
  // Assuming this is float 32.
1128
5
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1129
5
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1130
5
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1131
5
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1132
5
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1133
5
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1134
5
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1135
5
  ccv_nnc_tensor_view_get_dim(a, dim);
1136
5
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1137
5
  int x;
1138
5
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1139
5
  {
1140
    // Super optimal case, just do one for-loop for sum.
1141
5
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1142
2.01k
    for (x = 0; x < tensor_count; x++)
1143
2.01k
      b->data.f32[x] = sqrt(a->data.f32[x]);
1144
5
    return CCV_NNC_EXEC_SUCCESS;
1145
5
  }
1146
5
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1147
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1148
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1149
0
  int i[CCV_NNC_MAX_DIM + 2];
1150
0
  float* const ap = a->data.f32;
1151
0
  float* const bp = b->data.f32;
1152
0
  const int count = dim[2] * dim[3];
1153
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1154
0
  {
1155
    // Special casing if the ainc[3] is the same as dim[3]
1156
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1157
0
    {
1158
0
      float* ap0 = ap + i[0] * astride[0];
1159
0
      float* bp0 = bp + i[0] * bstride[0];
1160
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1161
0
      {
1162
0
        for (x = 0; x < count; x++)
1163
0
          bp0[x] = sqrt(ap0[x]);
1164
0
        ap0 += astride[1];
1165
0
        bp0 += bstride[1];
1166
0
      }
1167
0
    }
1168
0
    return CCV_NNC_EXEC_SUCCESS;
1169
0
  }
1170
  // Non-optimal case, need to do skip copy.
1171
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1172
0
  {
1173
0
    float* const ap0 = ap + i[0] * astride[0];
1174
0
    float* const bp0 = bp + i[0] * bstride[0];
1175
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1176
0
    {
1177
0
      float* ap1 = ap0 + i[1] * astride[1];
1178
0
      float* bp1 = bp0 + i[1] * bstride[1];
1179
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1180
0
      {
1181
0
        for (x = 0; x < dim[3]; x++)
1182
0
          bp1[x] = sqrt(ap1[x]);
1183
0
        ap1 += astride[2];
1184
0
        bp1 += bstride[2];
1185
0
      }
1186
0
    }
1187
0
  }
1188
0
  return CCV_NNC_EXEC_SUCCESS;
1189
0
}
1190
1191
static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1192
2
{
1193
  // D[Sqrt[x], x] = 0.5 / Sqrt[x]
1194
2
  _ccv_nnc_ewdiv_forw_cpu_ref(0.5, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1195
2
  return CCV_NNC_EXEC_SUCCESS;
1196
2
}
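
D[Sqrt[x], x] = 1 / (2 Sqrt[x]) = 0.5 / y, where y = Sqrt[x] is the saved forward output (inputs[2]); hence the division helper is called with p = 0.5, a = g and b = y:

    y = Sqrt[x]  =>  gradient = 0.5 * g / y
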
1197
1198
static int _ccv_nnc_ewabs_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1199
2
{
1200
  // Assuming this is float 32.
1201
2
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1202
2
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1203
2
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1204
2
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1205
2
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1206
2
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1207
2
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1208
2
  ccv_nnc_tensor_view_get_dim(a, dim);
1209
2
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1210
2
  int x;
1211
2
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1212
2
  {
1213
    // Super optimal case, just do one for-loop for sum.
1214
2
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1215
2.00k
    for (x = 0; x < tensor_count; x++)
1216
2.00k
      b->data.f32[x] = fabs(a->data.f32[x]);
1217
2
    return CCV_NNC_EXEC_SUCCESS;
1218
2
  }
1219
2
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1220
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1221
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1222
0
  int i[CCV_NNC_MAX_DIM + 2];
1223
0
  float* const ap = a->data.f32;
1224
0
  float* const bp = b->data.f32;
1225
0
  const int count = dim[2] * dim[3];
1226
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1227
0
  {
1228
    // Special casing if the ainc[3] is the same as dim[3]
1229
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1230
0
    {
1231
0
      float* ap0 = ap + i[0] * astride[0];
1232
0
      float* bp0 = bp + i[0] * bstride[0];
1233
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1234
0
      {
1235
0
        for (x = 0; x < count; x++)
1236
0
          bp0[x] = fabs(ap0[x]);
1237
0
        ap0 += astride[1];
1238
0
        bp0 += bstride[1];
1239
0
      }
1240
0
    }
1241
0
    return CCV_NNC_EXEC_SUCCESS;
1242
0
  }
1243
  // Non-optimal case, need to do skip copy.
1244
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1245
0
  {
1246
0
    float* const ap0 = ap + i[0] * astride[0];
1247
0
    float* const bp0 = bp + i[0] * bstride[0];
1248
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1249
0
    {
1250
0
      float* ap1 = ap0 + i[1] * astride[1];
1251
0
      float* bp1 = bp0 + i[1] * bstride[1];
1252
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1253
0
      {
1254
0
        for (x = 0; x < dim[3]; x++)
1255
0
          bp1[x] = fabs(ap1[x]);
1256
0
        ap1 += astride[2];
1257
0
        bp1 += bstride[2];
1258
0
      }
1259
0
    }
1260
0
  }
1261
0
  return CCV_NNC_EXEC_SUCCESS;
1262
0
}
1263
1264
static int _ccv_nnc_ewabs_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1265
1
{
1266
  // Assuming this is float 32.
1267
1
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1268
1
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
1269
1
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1270
1
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1271
1
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
1272
1
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
1273
1
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1274
1
  assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
1275
1
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1276
1
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1277
1
  ccv_nnc_tensor_view_get_dim(a, dim);
1278
1
  assert(ccv_nnc_tensor_view_check_dim(g, dim));
1279
1
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1280
1
  int x;
1281
1
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(g))
1282
1
  {
1283
    // Super optimal case, just do one for-loop for the abs gradient.
1284
1
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1285
1.00k
    for (x = 0; x < tensor_count; x++)
1286
1.00k
      b->data.f32[x] = a->data.f32[x] >= 0 ? g->data.f32[x] : -g->data.f32[x];
1287
1
    return CCV_NNC_EXEC_SUCCESS;
1288
1
  }
1289
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1290
0
  ccv_nnc_tensor_view_get_stride(g, gstride); // Fill gstride (not astride) so the strided loops below read g correctly.
1291
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1292
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1293
0
  int i[CCV_NNC_MAX_DIM + 2];
1294
0
  float* const gp = g->data.f32;
1295
0
  float* const ap = a->data.f32;
1296
0
  float* const bp = b->data.f32;
1297
0
  const int count = dim[2] * dim[3];
1298
0
  if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3])
1299
0
  {
1300
    // Special casing if astride[2] is the same as dim[3]
1301
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1302
0
    {
1303
0
      float* gp0 = gp + i[0] * gstride[0];
1304
0
      float* ap0 = ap + i[0] * astride[0];
1305
0
      float* bp0 = bp + i[0] * bstride[0];
1306
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1307
0
      {
1308
0
        for (x = 0; x < count; x++)
1309
0
          bp0[x] = ap0[x] >= 0 ? gp0[x] : -gp0[x];
1310
0
        gp0 += gstride[1];
1311
0
        ap0 += astride[1];
1312
0
        bp0 += bstride[1];
1313
0
      }
1314
0
    }
1315
0
    return CCV_NNC_EXEC_SUCCESS;
1316
0
  }
1317
  // Non-optimal case, need to do skip copy.
1318
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1319
0
  {
1320
0
    float* const gp0 = gp + i[0] * gstride[0];
1321
0
    float* const ap0 = ap + i[0] * astride[0];
1322
0
    float* const bp0 = bp + i[0] * bstride[0];
1323
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1324
0
    {
1325
0
      float* gp1 = gp0 + i[1] * gstride[1];
1326
0
      float* ap1 = ap0 + i[1] * astride[1];
1327
0
      float* bp1 = bp0 + i[1] * bstride[1];
1328
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1329
0
      {
1330
0
        for (x = 0; x < dim[3]; x++)
1331
0
          bp1[x] = ap1[x] >= 0 ? gp1[x] : -gp1[x];
1332
0
        gp1 += gstride[2];
1333
0
        ap1 += astride[2];
1334
0
        bp1 += bstride[2];
1335
0
      }
1336
0
    }
1337
0
  }
1338
0
  return CCV_NNC_EXEC_SUCCESS;
1339
0
}
1340
1341
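A minimal standalone sketch (hypothetical names, not the library API) of the abs gradient rule the backward kernel above applies: d|x|/dx is +1 for x >= 0 and -1 otherwise, with x == 0 treated as +1 to match the kernel's ">= 0" test.

// Hypothetical illustration: dx[i] = g[i] * sign(x[i]), with sign(0) taken as +1.
static void abs_backward_sketch(const float* g, const float* x, float* dx, int n)
{
  int i;
  for (i = 0; i < n; i++)
    dx[i] = x[i] >= 0 ? g[i] : -g[i];
}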
static int _ccv_nnc_clamp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1342
8
{
1343
  // Assuming this is float 32.
1344
8
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1345
8
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1346
8
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1347
8
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1348
8
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1349
8
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1350
8
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1351
8
  ccv_nnc_tensor_view_get_dim(a, dim);
1352
8
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1353
8
  int x;
1354
8
  const float min = cmd.info.clamp.min;
1355
8
  const float max = cmd.info.clamp.max;
1356
8
  assert(!isnan(min) || !isnan(max));
1357
8
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1358
8
  {
1359
    // Super optimal case, just do one for-loop for clamp.
1360
8
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1361
8
    if (isnan(min))
1362
4
    {
1363
2.00k
      for (x = 0; x < tensor_count; x++)
1364
2.00k
        b->data.f32[x] = ccv_min(a->data.f32[x], max);
1365
4
    } else if (isnan(max)) {
1366
2.00k
      for (x = 0; x < tensor_count; x++)
1367
2.00k
        b->data.f32[x] = ccv_max(a->data.f32[x], min);
1368
2
    } else {
1369
2.00k
      for (x = 0; x < tensor_count; x++)
1370
2.00k
        b->data.f32[x] = ccv_clamp(a->data.f32[x], min, max);
1371
2
    }
1372
8
    return CCV_NNC_EXEC_SUCCESS;
1373
8
  }
1374
8
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1375
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1376
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1377
0
  int i[CCV_NNC_MAX_DIM + 2];
1378
0
  float* const ap = a->data.f32;
1379
0
  float* const bp = b->data.f32;
1380
0
  const int count = dim[2] * dim[3];
1381
0
  if (isnan(min))
1382
0
  {
1383
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1384
0
    {
1385
      // Special casing if astride[2] is the same as dim[3]
1386
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1387
0
      {
1388
0
        float* ap0 = ap + i[0] * astride[0];
1389
0
        float* bp0 = bp + i[0] * bstride[0];
1390
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1391
0
        {
1392
0
          for (x = 0; x < count; x++)
1393
0
            bp0[x] = ccv_min(ap0[x], max);
1394
0
          ap0 += astride[1];
1395
0
          bp0 += bstride[1];
1396
0
        }
1397
0
      }
1398
0
      return CCV_NNC_EXEC_SUCCESS;
1399
0
    }
1400
    // Non-optimal case, need to do skip copy.
1401
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1402
0
    {
1403
0
      float* const ap0 = ap + i[0] * astride[0];
1404
0
      float* const bp0 = bp + i[0] * bstride[0];
1405
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1406
0
      {
1407
0
        float* ap1 = ap0 + i[1] * astride[1];
1408
0
        float* bp1 = bp0 + i[1] * bstride[1];
1409
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1410
0
        {
1411
0
          for (x = 0; x < dim[3]; x++)
1412
0
            bp1[x] = ccv_min(ap1[x], max);
1413
0
          ap1 += astride[2];
1414
0
          bp1 += bstride[2];
1415
0
        }
1416
0
      }
1417
0
    }
1418
0
  } else if (isnan(max)) {
1419
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1420
0
    {
1421
      // Special casing if astride[2] is the same as dim[3]
1422
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1423
0
      {
1424
0
        float* ap0 = ap + i[0] * astride[0];
1425
0
        float* bp0 = bp + i[0] * bstride[0];
1426
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1427
0
        {
1428
0
          for (x = 0; x < count; x++)
1429
0
            bp0[x] = ccv_max(ap0[x], min);
1430
0
          ap0 += astride[1];
1431
0
          bp0 += bstride[1];
1432
0
        }
1433
0
      }
1434
0
      return CCV_NNC_EXEC_SUCCESS;
1435
0
    }
1436
    // Non-optimal case, need to do skip copy.
1437
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1438
0
    {
1439
0
      float* const ap0 = ap + i[0] * astride[0];
1440
0
      float* const bp0 = bp + i[0] * bstride[0];
1441
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1442
0
      {
1443
0
        float* ap1 = ap0 + i[1] * astride[1];
1444
0
        float* bp1 = bp0 + i[1] * bstride[1];
1445
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1446
0
        {
1447
0
          for (x = 0; x < dim[3]; x++)
1448
0
            bp1[x] = ccv_max(ap1[x], min);
1449
0
          ap1 += astride[2];
1450
0
          bp1 += bstride[2];
1451
0
        }
1452
0
      }
1453
0
    }
1454
0
  } else {
1455
0
    if (astride[2] == dim[3] && bstride[2] == dim[3])
1456
0
    {
1457
      // Special casing if astride[2] is the same as dim[3]
1458
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1459
0
      {
1460
0
        float* ap0 = ap + i[0] * astride[0];
1461
0
        float* bp0 = bp + i[0] * bstride[0];
1462
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1463
0
        {
1464
0
          for (x = 0; x < count; x++)
1465
0
            bp0[x] = ccv_clamp(ap0[x], min, max);
1466
0
          ap0 += astride[1];
1467
0
          bp0 += bstride[1];
1468
0
        }
1469
0
      }
1470
0
      return CCV_NNC_EXEC_SUCCESS;
1471
0
    }
1472
    // Non-optimal case, need to do skip copy.
1473
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1474
0
    {
1475
0
      float* const ap0 = ap + i[0] * astride[0];
1476
0
      float* const bp0 = bp + i[0] * bstride[0];
1477
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1478
0
      {
1479
0
        float* ap1 = ap0 + i[1] * astride[1];
1480
0
        float* bp1 = bp0 + i[1] * bstride[1];
1481
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1482
0
        {
1483
0
          for (x = 0; x < dim[3]; x++)
1484
0
            bp1[x] = ccv_clamp(ap1[x], min, max);
1485
0
          ap1 += astride[2];
1486
0
          bp1 += bstride[2];
1487
0
        }
1488
0
      }
1489
0
    }
1490
0
  }
1491
0
  return CCV_NNC_EXEC_SUCCESS;
1492
0
}
1493
1494
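The forward kernel above treats a NaN bound as "no bound on that side": if cmd.info.clamp.min is NaN only the upper bound is applied, if cmd.info.clamp.max is NaN only the lower bound is applied, otherwise both. A minimal per-element sketch under that assumption (the helper name is hypothetical; the library's own ccv_min/ccv_max/ccv_clamp macros are replaced by plain expressions):

#include <math.h>

// Hypothetical illustration of the NaN-as-unbounded convention used by the clamp forward kernel.
static float clamp_one_sketch(float v, float min, float max)
{
  if (isnan(min))
    return v < max ? v : max; // upper bound only
  if (isnan(max))
    return v > min ? v : min; // lower bound only
  return v < min ? min : (v > max ? max : v); // both bounds
}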
static int _ccv_nnc_clamp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1495
3
{
1496
3
  assert(input_size == 3);
1497
3
  const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradient
1498
3
  const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
1499
3
  assert(output_size == 1);
1500
3
  ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
1501
  // Assuming this is float 32.
1502
3
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1503
3
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
1504
3
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1505
3
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
1506
3
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1507
3
  ccv_nnc_tensor_view_get_dim(g, dim);
1508
3
  ccv_nnc_tensor_view_get_dim(h, dim);
1509
3
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1510
3
  int x;
1511
3
  const float min = cmd.info.clamp.min;
1512
3
  const float max = cmd.info.clamp.max;
1513
3
  assert(!isnan(min) || !isnan(max));
1514
3
  if (g)
1515
3
  {
1516
3
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1517
3
    {
1518
      // Super optimal case, just do one for-loop for the clamp gradient.
1519
3
      const int tensor_count = ccv_nnc_tensor_count(g->info);
1520
3
      if (isnan(min))
1521
1
      {
1522
1.00k
        for (x = 0; x < tensor_count; x++)
1523
1.00k
          h->data.f32[x] = b->data.f32[x] >= max ? 0 : g->data.f32[x];
1524
2
      } else if (isnan(max)) {
1525
1.00k
        for (x = 0; x < tensor_count; x++)
1526
1.00k
          h->data.f32[x] = b->data.f32[x] <= min ? 0 : g->data.f32[x];
1527
1
      } else {
1528
1.00k
        for (x = 0; x < tensor_count; x++)
1529
1.00k
          h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : g->data.f32[x];
1530
1
      }
1531
3
      return CCV_NNC_EXEC_SUCCESS;
1532
3
    }
1533
0
    int gstride[CCV_NNC_MAX_DIM_ALLOC];
1534
0
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
1535
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1536
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
1537
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
1538
0
    ccv_nnc_tensor_view_get_stride(h, hstride);
1539
0
    int i[CCV_NNC_MAX_DIM + 2];
1540
0
    float* const gp = g->data.f32;
1541
0
    float* const bp = b->data.f32;
1542
0
    float* const hp = h->data.f32;
1543
0
    const int count = dim[2] * dim[3];
1544
0
    const float min = cmd.info.clamp.min;
1545
0
    const float max = cmd.info.clamp.max;
1546
0
    assert(!isnan(min) || !isnan(max));
1547
0
    if (isnan(min))
1548
0
    {
1549
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1550
0
      {
1551
        // Special casing if gstride[2] is the same as dim[3]
1552
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1553
0
        {
1554
0
          float* gp0 = gp + i[0] * gstride[0];
1555
0
          float* bp0 = bp + i[0] * bstride[0];
1556
0
          float* hp0 = hp + i[0] * hstride[0];
1557
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1558
0
          {
1559
0
            for (x = 0; x < count; x++)
1560
0
              hp0[x] = bp0[x] >= max ? 0 : gp0[x];
1561
0
            gp0 += gstride[1];
1562
0
            bp0 += bstride[1];
1563
0
            hp0 += hstride[1];
1564
0
          }
1565
0
        }
1566
0
        return CCV_NNC_EXEC_SUCCESS;
1567
0
      }
1568
      // Non-optimal case, need to do skip copy.
1569
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1570
0
      {
1571
0
        float* const gp0 = gp + i[0] * gstride[0];
1572
0
        float* const bp0 = bp + i[0] * bstride[0];
1573
0
        float* const hp0 = hp + i[0] * hstride[0];
1574
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1575
0
        {
1576
0
          float* gp1 = gp0 + i[1] * gstride[1];
1577
0
          float* bp1 = bp0 + i[1] * bstride[1];
1578
0
          float* hp1 = hp0 + i[1] * hstride[1];
1579
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1580
0
          {
1581
0
            for (x = 0; x < dim[3]; x++)
1582
0
              hp1[x] = bp1[x] >= max ? 0 : gp1[x];
1583
0
            gp1 += gstride[2];
1584
0
            bp1 += bstride[2];
1585
0
            hp1 += hstride[2];
1586
0
          }
1587
0
        }
1588
0
      }
1589
0
    } else if (isnan(max)) {
1590
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1591
0
      {
1592
        // Special casing if gstride[2] is the same as dim[3]
1593
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1594
0
        {
1595
0
          float* gp0 = gp + i[0] * gstride[0];
1596
0
          float* bp0 = bp + i[0] * bstride[0];
1597
0
          float* hp0 = hp + i[0] * hstride[0];
1598
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1599
0
          {
1600
0
            for (x = 0; x < count; x++)
1601
0
              hp0[x] = bp0[x] <= min ? 0 : gp0[x];
1602
0
            gp0 += gstride[1];
1603
0
            bp0 += bstride[1];
1604
0
            hp0 += hstride[1];
1605
0
          }
1606
0
        }
1607
0
        return CCV_NNC_EXEC_SUCCESS;
1608
0
      }
1609
      // Non-optimal case, need to do skip copy.
1610
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1611
0
      {
1612
0
        float* const gp0 = gp + i[0] * gstride[0];
1613
0
        float* const bp0 = bp + i[0] * bstride[0];
1614
0
        float* const hp0 = hp + i[0] * hstride[0];
1615
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1616
0
        {
1617
0
          float* gp1 = gp0 + i[1] * gstride[1];
1618
0
          float* bp1 = bp0 + i[1] * bstride[1];
1619
0
          float* hp1 = hp0 + i[1] * hstride[1];
1620
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1621
0
          {
1622
0
            for (x = 0; x < dim[3]; x++)
1623
0
              hp1[x] = bp1[x] <= min ? 0 : gp1[x];
1624
0
            gp1 += gstride[2];
1625
0
            bp1 += bstride[2];
1626
0
            hp1 += hstride[2];
1627
0
          }
1628
0
        }
1629
0
      }
1630
0
    } else {
1631
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3])
1632
0
      {
1633
        // Special casing if gstride[2] is the same as dim[3]
1634
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1635
0
        {
1636
0
          float* gp0 = gp + i[0] * gstride[0];
1637
0
          float* bp0 = bp + i[0] * bstride[0];
1638
0
          float* hp0 = hp + i[0] * hstride[0];
1639
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1640
0
          {
1641
0
            for (x = 0; x < count; x++)
1642
0
              hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : gp0[x];
1643
0
            gp0 += gstride[1];
1644
0
            bp0 += bstride[1];
1645
0
            hp0 += hstride[1];
1646
0
          }
1647
0
        }
1648
0
        return CCV_NNC_EXEC_SUCCESS;
1649
0
      }
1650
      // Non-optimal case, need to do skip copy.
1651
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1652
0
      {
1653
0
        float* const gp0 = gp + i[0] * gstride[0];
1654
0
        float* const bp0 = bp + i[0] * bstride[0];
1655
0
        float* const hp0 = hp + i[0] * hstride[0];
1656
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1657
0
        {
1658
0
          float* gp1 = gp0 + i[1] * gstride[1];
1659
0
          float* bp1 = bp0 + i[1] * bstride[1];
1660
0
          float* hp1 = hp0 + i[1] * hstride[1];
1661
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1662
0
          {
1663
0
            for (x = 0; x < dim[3]; x++)
1664
0
              hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : gp1[x];
1665
0
            gp1 += gstride[2];
1666
0
            bp1 += bstride[2];
1667
0
            hp1 += hstride[2];
1668
0
          }
1669
0
        }
1670
0
      }
1671
0
    }
1672
0
  } else {
1673
0
    if (!CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
1674
0
    {
1675
      // Super optimal case, just do one for-loop for the clamp gradient (0/1 mask when no incoming gradient).
1676
0
      const int tensor_count = ccv_nnc_tensor_count(h->info);
1677
0
      if (isnan(min))
1678
0
      {
1679
0
        for (x = 0; x < tensor_count; x++)
1680
0
          h->data.f32[x] = b->data.f32[x] >= max ? 0 : 1;
1681
0
      } else if (isnan(max)) {
1682
0
        for (x = 0; x < tensor_count; x++)
1683
0
          h->data.f32[x] = b->data.f32[x] <= min ? 0 : 1;
1684
0
      } else {
1685
0
        for (x = 0; x < tensor_count; x++)
1686
0
          h->data.f32[x] = (b->data.f32[x] >= max || b->data.f32[x] <= min) ? 0 : 1;
1687
0
      }
1688
0
      return CCV_NNC_EXEC_SUCCESS;
1689
0
    }
1690
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1691
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
1692
0
    ccv_nnc_tensor_view_get_stride(h, hstride);
1693
0
    int i[CCV_NNC_MAX_DIM + 2];
1694
0
    float* const bp = b->data.f32;
1695
0
    float* const hp = h->data.f32;
1696
0
    const int count = dim[2] * dim[3];
1697
0
    const float min = cmd.info.clamp.min;
1698
0
    const float max = cmd.info.clamp.max;
1699
0
    assert(!isnan(min) || !isnan(max));
1700
0
    if (isnan(min))
1701
0
    {
1702
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1703
0
      {
1704
        // Special casing if bstride[2] is the same as dim[3]
1705
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1706
0
        {
1707
0
          float* bp0 = bp + i[0] * bstride[0];
1708
0
          float* hp0 = hp + i[0] * hstride[0];
1709
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1710
0
          {
1711
0
            for (x = 0; x < count; x++)
1712
0
              hp0[x] = bp0[x] >= max ? 0 : 1;
1713
0
            bp0 += bstride[1];
1714
0
            hp0 += hstride[1];
1715
0
          }
1716
0
        }
1717
0
        return CCV_NNC_EXEC_SUCCESS;
1718
0
      }
1719
      // Non-optimal case, need to do skip copy.
1720
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1721
0
      {
1722
0
        float* const bp0 = bp + i[0] * bstride[0];
1723
0
        float* const hp0 = hp + i[0] * hstride[0];
1724
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1725
0
        {
1726
0
          float* bp1 = bp0 + i[1] * bstride[1];
1727
0
          float* hp1 = hp0 + i[1] * hstride[1];
1728
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1729
0
          {
1730
0
            for (x = 0; x < dim[3]; x++)
1731
0
              hp1[x] = bp1[x] >= max ? 0 : 1;
1732
0
            bp1 += bstride[2];
1733
0
            hp1 += hstride[2];
1734
0
          }
1735
0
        }
1736
0
      }
1737
0
    } else if (isnan(max)) {
1738
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1739
0
      {
1740
        // Special casing if bstride[2] is the same as dim[3]
1741
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1742
0
        {
1743
0
          float* bp0 = bp + i[0] * bstride[0];
1744
0
          float* hp0 = hp + i[0] * hstride[0];
1745
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1746
0
          {
1747
0
            for (x = 0; x < count; x++)
1748
0
              hp0[x] = bp0[x] <= min ? 0 : 1;
1749
0
            bp0 += bstride[1];
1750
0
            hp0 += hstride[1];
1751
0
          }
1752
0
        }
1753
0
        return CCV_NNC_EXEC_SUCCESS;
1754
0
      }
1755
      // Non-optimal case, need to do skip copy.
1756
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1757
0
      {
1758
0
        float* const bp0 = bp + i[0] * bstride[0];
1759
0
        float* const hp0 = hp + i[0] * hstride[0];
1760
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1761
0
        {
1762
0
          float* bp1 = bp0 + i[1] * bstride[1];
1763
0
          float* hp1 = hp0 + i[1] * hstride[1];
1764
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1765
0
          {
1766
0
            for (x = 0; x < dim[3]; x++)
1767
0
              hp1[x] = bp1[x] <= min ? 0 : 1;
1768
0
            bp1 += bstride[2];
1769
0
            hp1 += hstride[2];
1770
0
          }
1771
0
        }
1772
0
      }
1773
0
    } else {
1774
0
      if (bstride[2] == dim[3] && hstride[2] == dim[3])
1775
0
      {
1776
        // Special casing if bstride[2] is the same as dim[3]
1777
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
1778
0
        {
1779
0
          float* bp0 = bp + i[0] * bstride[0];
1780
0
          float* hp0 = hp + i[0] * hstride[0];
1781
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
1782
0
          {
1783
0
            for (x = 0; x < count; x++)
1784
0
              hp0[x] = (bp0[x] >= max || bp0[x] <= min) ? 0 : 1;
1785
0
            bp0 += bstride[1];
1786
0
            hp0 += hstride[1];
1787
0
          }
1788
0
        }
1789
0
        return CCV_NNC_EXEC_SUCCESS;
1790
0
      }
1791
      // Non-optimal case, need to do skip copy.
1792
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
1793
0
      {
1794
0
        float* const bp0 = bp + i[0] * bstride[0];
1795
0
        float* const hp0 = hp + i[0] * hstride[0];
1796
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
1797
0
        {
1798
0
          float* bp1 = bp0 + i[1] * bstride[1];
1799
0
          float* hp1 = hp0 + i[1] * hstride[1];
1800
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
1801
0
          {
1802
0
            for (x = 0; x < dim[3]; x++)
1803
0
              hp1[x] = (bp1[x] >= max || bp1[x] <= min) ? 0 : 1;
1804
0
            bp1 += bstride[2];
1805
0
            hp1 += hstride[2];
1806
0
          }
1807
0
        }
1808
0
      }
1809
0
    }
1810
0
  }
1811
0
  return CCV_NNC_EXEC_SUCCESS;
1812
3
}
1813
1814
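A minimal standalone sketch (hypothetical names, both bounds assumed finite) of the clamp gradient rule the backward kernel above implements: the gradient is zeroed wherever the clamped forward output b sits at a bound and passed through otherwise; when no incoming gradient is supplied, the kernel emits the corresponding 0/1 mask instead.

// Hypothetical illustration of the clamp backward rule with finite min and max.
static void clamp_backward_sketch(const float* g, const float* b, float* h, int n, float min, float max)
{
  int i;
  for (i = 0; i < n; i++)
    h[i] = (b[i] >= max || b[i] <= min) ? 0 : g[i]; // saturated at a bound -> no gradient flows
}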
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1815
1
{
1816
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1817
1
  registry->tensor_datatypes = CCV_32F;
1818
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1819
1
  registry->algorithms = 1;
1820
1
  registry->exec = _ccv_nnc_ewsum_forw;
1821
1
}
1822
1823
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1824
1
{
1825
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1826
1
  registry->tensor_datatypes = CCV_32F;
1827
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1828
1
  registry->algorithms = 1;
1829
1
  registry->exec = _ccv_nnc_ewsum_back;
1830
1
}
1831
1832
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1833
1
{
1834
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1835
1
  registry->tensor_datatypes = CCV_32F;
1836
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1837
1
  registry->algorithms = 1;
1838
1
  registry->exec = _ccv_nnc_ewprod_forw;
1839
1
}
1840
1841
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1842
1
{
1843
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1844
1
  registry->tensor_datatypes = CCV_32F;
1845
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1846
1
  registry->algorithms = 1;
1847
1
  registry->exec = _ccv_nnc_ewprod_back;
1848
1
}
1849
1850
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1851
1
{
1852
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1853
1
  registry->tensor_datatypes = CCV_32F;
1854
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1855
1
  registry->algorithms = 1;
1856
1
  registry->exec = _ccv_nnc_ewdiv_forw;
1857
1
}
1858
1859
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1860
1
{
1861
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1862
1
  registry->tensor_datatypes = CCV_32F;
1863
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1864
1
  registry->algorithms = 1;
1865
1
  registry->exec = _ccv_nnc_ewdiv_back;
1866
1
}
1867
1868
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1869
1
{
1870
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1871
1
  registry->tensor_datatypes = CCV_32F;
1872
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1873
1
  registry->algorithms = 1;
1874
1
  registry->exec = _ccv_nnc_ewexp_forw;
1875
1
}
1876
1877
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1878
1
{
1879
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1880
1
  registry->tensor_datatypes = CCV_32F;
1881
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1882
1
  registry->algorithms = 1;
1883
1
  registry->exec = _ccv_nnc_ewexp_back;
1884
1
}
1885
1886
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1887
1
{
1888
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1889
1
  registry->tensor_datatypes = CCV_32F;
1890
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1891
1
  registry->algorithms = 1;
1892
1
  registry->exec = _ccv_nnc_ewlog_forw;
1893
1
}
1894
1895
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1896
1
{
1897
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1898
1
  registry->tensor_datatypes = CCV_32F;
1899
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1900
1
  registry->algorithms = 1;
1901
1
  registry->exec = _ccv_nnc_ewlog_back;
1902
1
}
1903
1904
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1905
1
{
1906
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1907
1
  registry->tensor_datatypes = CCV_32F;
1908
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1909
1
  registry->algorithms = 1;
1910
1
  registry->exec = _ccv_nnc_ewsqrt_forw;
1911
1
}
1912
1913
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1914
1
{
1915
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1916
1
  registry->tensor_datatypes = CCV_32F;
1917
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1918
1
  registry->algorithms = 1;
1919
1
  registry->exec = _ccv_nnc_ewsqrt_back;
1920
1
}
1921
1922
REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1923
1
{
1924
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1925
1
  registry->tensor_datatypes = CCV_32F;
1926
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1927
1
  registry->algorithms = 1;
1928
1
  registry->exec = _ccv_nnc_ewabs_forw;
1929
1
}
1930
1931
REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1932
1
{
1933
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1934
1
  registry->tensor_datatypes = CCV_32F;
1935
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1936
1
  registry->algorithms = 1;
1937
1
  registry->exec = _ccv_nnc_ewabs_back;
1938
1
}
1939
1940
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1941
1
{
1942
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1943
1
  registry->tensor_datatypes = CCV_32F;
1944
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1945
1
  registry->algorithms = 1;
1946
1
  registry->exec = _ccv_nnc_clamp_forw;
1947
1
}
1948
1949
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1950
1
{
1951
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
1952
1
  registry->tensor_datatypes = CCV_32F;
1953
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1954
1
  registry->algorithms = 1;
1955
1
  registry->exec = _ccv_nnc_clamp_back;
1956
1
}