Coverage Report

Created: 2026-04-14 19:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
// Element-wise sum of float32 tensors, accumulated pairwise into outputs[0].
// - inputs / input_size: the tensors to add up.
// - outputs / output_size: only outputs[0] is written.
// With exactly one input and one output the call is a plain tensor transfer.
// The operation may run in place: the first input after inputs[0] whose data pointer
// equals the output's is consumed first, so it is read before the output is overwritten.
void _ccv_nnc_ewsum_forw_cpu_ref_f32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
  // One input summed into one output degenerates to a copy.
  if (input_size == 1 && output_size == 1)
  {
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
    return;
  }
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
  int n, step;
  // Index of the first input after inputs[0] that shares the output's data pointer (0 if none).
  int aliased = 0;
  for (step = 1; step < input_size; step++)
    if (outputs[0]->data.f32 == inputs[step]->data.f32)
    {
      aliased = step;
      break;
    }
  // Each step folds one more input into the running sum held in the output.
  for (step = 0; step < input_size - 1; step++)
  {
    ccv_nnc_tensor_view_t* const c = outputs[0];
    ccv_nnc_tensor_view_t* a;
    ccv_nnc_tensor_view_t* b;
    // First step starts from the aliased input; later steps accumulate onto the output.
    if (step > 0)
      a = c;
    else
      a = inputs[aliased];
    // Walk the remaining inputs in order, skipping over the aliased one.
    if (step >= aliased)
      b = inputs[step + 1];
    else
      b = inputs[step];
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(a, dim);
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
    if (CCV_IS_TENSOR_VIEW(a) || CCV_IS_TENSOR_VIEW(b) || CCV_IS_TENSOR_VIEW(c))
    {
      assert(CCV_NNC_MAX_DIM == 2); // The loops below are written for exactly 4 dimensions.
      ccv_nnc_tensor_view_get_stride(a, astride);
      ccv_nnc_tensor_view_get_stride(b, bstride);
      ccv_nnc_tensor_view_get_stride(c, cstride);
      float* const abase = a->data.f32;
      float* const bbase = b->data.f32;
      float* const cbase = c->data.f32;
      const int plane = dim[2] * dim[3];
      int i0, i1, i2;
      if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
      {
        // The last two dimensions are packed in all three tensors: sum them as one run.
        for (i0 = 0; i0 < dim[0]; i0++)
        {
          float* arow = abase + i0 * astride[0];
          float* brow = bbase + i0 * bstride[0];
          float* crow = cbase + i0 * cstride[0];
          for (i1 = 0; i1 < dim[1]; i1++)
          {
            for (n = 0; n < plane; n++)
              crow[n] = arow[n] + brow[n];
            arow += astride[1];
            brow += bstride[1];
            crow += cstride[1];
          }
        }
      } else {
        // Fully strided case: honor every stride, including the innermost one.
        for (i0 = 0; i0 < dim[0]; i0++)
        {
          float* const a0 = abase + i0 * astride[0];
          float* const b0 = bbase + i0 * bstride[0];
          float* const c0 = cbase + i0 * cstride[0];
          for (i1 = 0; i1 < dim[1]; i1++)
          {
            float* a1 = a0 + i1 * astride[1];
            float* b1 = b0 + i1 * bstride[1];
            float* c1 = c0 + i1 * cstride[1];
            for (i2 = 0; i2 < dim[2]; i2++)
            {
              for (n = 0; n < dim[3]; n++)
                c1[n * cstride[3]] = a1[n * astride[3]] + b1[n * bstride[3]];
              a1 += astride[2];
              b1 += bstride[2];
              c1 += cstride[2];
            }
          }
        }
      }
    } else {
      // None of the three is a tensor view: a single flat loop covers every element.
      const int total = ccv_nnc_tensor_count(a->info);
      for (n = 0; n < total; n++)
        c->data.f32[n] = a->data.f32[n] + b->data.f32[n];
    }
  }
}
110
111
// Element-wise sum of int32 tensors, accumulated pairwise into outputs[0].
// - inputs / input_size: the tensors to add up.
// - outputs / output_size: only outputs[0] is written.
// With exactly one input and one output the call is a plain tensor transfer.
// The operation may run in place: the first input after inputs[0] whose data pointer
// equals the output's is consumed first, so it is read before the output is overwritten.
void _ccv_nnc_ewsum_forw_cpu_ref_i32(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
  if (input_size == 1 && output_size == 1)
  {
    // NOTE(review): the f32 transfer is reused to move 32-bit integer payloads; this presumes
    // it copies the bits verbatim -- confirm, or switch to an integer transfer if one exists.
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
    return;
  }
  // The payload is 32-bit signed integers throughout.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
  int x, z;
  int k = 0;
  // In-place support: find the first input after inputs[0] that shares the output's data pointer.
  for (z = 1; z < input_size; z++)
  {
    ccv_nnc_tensor_view_t* c = outputs[0];
    ccv_nnc_tensor_view_t* a = inputs[z];
    if (c->data.i32 == a->data.i32)
    {
      k = z;
      break;
    }
  }
  for (z = 0; z < input_size - 1; z++)
  {
    ccv_nnc_tensor_view_t* c = outputs[0];
    // First step starts from the aliased input; later steps accumulate onto the output.
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
    // Walk the remaining inputs in order, skipping over the aliased one.
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(a, dim);
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
    {
      // None of the three is a tensor view: a single flat loop covers every element.
      // This must add integers; going through data.f32 would add the bit patterns as floats.
      const int tensor_count = ccv_nnc_tensor_count(a->info);
      for (x = 0; x < tensor_count; x++)
        c->data.i32[x] = a->data.i32[x] + b->data.i32[x];
      continue;
    }
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
    ccv_nnc_tensor_view_get_stride(a, astride);
    ccv_nnc_tensor_view_get_stride(b, bstride);
    ccv_nnc_tensor_view_get_stride(c, cstride);
    int i[CCV_NNC_MAX_DIM + 2];
    int* const ap = a->data.i32;
    int* const bp = b->data.i32;
    int* const cp = c->data.i32;
    const int count = dim[2] * dim[3];
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
    {
      // The last two dimensions are packed in all three tensors: sum them as one run.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        int* ap0 = ap + i[0] * astride[0];
        int* bp0 = bp + i[0] * bstride[0];
        int* cp0 = cp + i[0] * cstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          for (x = 0; x < count; x++)
            cp0[x] = ap0[x] + bp0[x];
          ap0 += astride[1];
          bp0 += bstride[1];
          cp0 += cstride[1];
        }
      }
      continue;
    }
    // Fully strided case: honor every stride, including the innermost one.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      int* const ap0 = ap + i[0] * astride[0];
      int* const bp0 = bp + i[0] * bstride[0];
      int* const cp0 = cp + i[0] * cstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        int* ap1 = ap0 + i[1] * astride[1];
        int* bp1 = bp0 + i[1] * bstride[1];
        int* cp1 = cp0 + i[1] * cstride[1];
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
        {
          for (x = 0; x < dim[3]; x++)
            cp1[x * cstride[3]] = ap1[x * astride[3]] + bp1[x * bstride[3]];
          ap1 += astride[2];
          bp1 += bstride[2];
          cp1 += cstride[2];
        }
      }
    }
  }
}
206
207
// Forward entry point for element-wise sum: picks the int32 or the float32 kernel
// from the datatype of outputs[0] and always reports success.
static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // The kernels take tensor views; the arrays are reinterpreted as such.
  ccv_nnc_tensor_view_t* const* const in_views = (ccv_nnc_tensor_view_t**)inputs;
  ccv_nnc_tensor_view_t* const* const out_views = (ccv_nnc_tensor_view_t**)outputs;
  if (outputs[0]->info.datatype != CCV_32S)
    _ccv_nnc_ewsum_forw_cpu_ref_f32(in_views, input_size, out_views, output_size);
  else
    _ccv_nnc_ewsum_forw_cpu_ref_i32(in_views, input_size, out_views, output_size);
  return CCV_NNC_EXEC_SUCCESS;
}
215
216
// Backward pass for element-wise sum. Since D[x + y + z, x] = 1, every requested
// output gradient is the incoming gradient inputs[0]; when inputs[0] is null the
// outputs are filled with 1 instead. Null outputs are skipped. Always reports success.
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_nnc_tensor_t* const g = inputs[0];
  int idx;
  for (idx = 0; idx < output_size; idx++)
  {
    ccv_nnc_tensor_t* const h = outputs[idx];
    if (!h)
      continue;
    if (!g)
      // No incoming gradient: fill the output with 1.
      _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)h, 1);
    else if (g->data.f32 != h->data.f32)
      // Copy the gradient over unless the output already shares its data pointer.
      _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)g, (ccv_nnc_tensor_view_t*)h);
  }
  return CCV_NNC_EXEC_SUCCESS;
}
234
235
// Element-wise product of float32 tensors, accumulated pairwise into outputs[0].
// - inputs / input_size: the tensors to multiply together.
// - outputs / output_size: only outputs[0] is written.
// With exactly one input and one output the call is a plain tensor transfer.
// The operation may run in place: the first input after inputs[0] whose data pointer
// equals the output's is consumed first, so it is read before the output is overwritten.
void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
  if (input_size == 1 && output_size == 1)
  {
    _ccv_nnc_tensor_transfer_cpu_ref_f32(inputs[0], outputs[0]);
    return;
  }
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
  int x, z;
  int k = 0;
  // In-place support: find the first input after inputs[0] that shares the output's data pointer.
  for (z = 1; z < input_size; z++)
  {
    ccv_nnc_tensor_view_t* c = outputs[0];
    ccv_nnc_tensor_view_t* a = inputs[z];
    if (c->data.f32 == a->data.f32)
    {
      k = z;
      break;
    }
  }
  for (z = 0; z < input_size - 1; z++)
  {
    ccv_nnc_tensor_view_t* c = outputs[0];
    // First step starts from the aliased input; later steps accumulate onto the output.
    ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
    // Walk the remaining inputs in order, skipping over the aliased one.
    ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(a, dim);
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
    {
      // None of the three is a tensor view: a single flat loop covers every element.
      const int tensor_count = ccv_nnc_tensor_count(a->info);
      for (x = 0; x < tensor_count; x++)
        c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
      continue;
    }
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
    ccv_nnc_tensor_view_get_stride(a, astride);
    ccv_nnc_tensor_view_get_stride(b, bstride);
    ccv_nnc_tensor_view_get_stride(c, cstride);
    int i[CCV_NNC_MAX_DIM + 2];
    float* const ap = a->data.f32;
    float* const bp = b->data.f32;
    float* const cp = c->data.f32;
    const int count = dim[2] * dim[3];
    // The packed shortcut is only valid when the innermost stride is 1 as well;
    // otherwise consecutive elements are not adjacent in memory.
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
    {
      // The last two dimensions are packed in all three tensors: multiply them as one run.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        float* ap0 = ap + i[0] * astride[0];
        float* bp0 = bp + i[0] * bstride[0];
        float* cp0 = cp + i[0] * cstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          for (x = 0; x < count; x++)
            cp0[x] = ap0[x] * bp0[x];
          ap0 += astride[1];
          bp0 += bstride[1];
          cp0 += cstride[1];
        }
      }
      continue;
    }
    // Fully strided case: honor every stride, including the innermost one.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      float* const ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      float* const cp0 = cp + i[0] * cstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        float* ap1 = ap0 + i[1] * astride[1];
        float* bp1 = bp0 + i[1] * bstride[1];
        float* cp1 = cp0 + i[1] * cstride[1];
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
        {
          for (x = 0; x < dim[3]; x++)
            cp1[x * cstride[3]] = ap1[x * astride[3]] * bp1[x * bstride[3]];
          ap1 += astride[2];
          bp1 += bstride[2];
          cp1 += cstride[2];
        }
      }
    }
  }
}
330
331
// Forward entry point for element-wise product: hands the tensors, reinterpreted as
// tensor views, to the CPU reference kernel and always reports success.
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_nnc_tensor_view_t* const* const in_views = (ccv_nnc_tensor_view_t**)inputs;
  ccv_nnc_tensor_view_t* const* const out_views = (ccv_nnc_tensor_view_t**)outputs;
  _ccv_nnc_ewprod_forw_cpu_ref(in_views, input_size, out_views, output_size);
  return CCV_NNC_EXEC_SUCCESS;
}
336
337
// Backward pass for element-wise product (float32).
// D[x * y * z, x] = y * z, computed here as b / a for each a = inputs[z + 1],
// where b = inputs[output_size + 1] (presumably the forward product -- confirm against caller).
// - inputs[0] (g): incoming gradient; when null it is treated as all ones.
// - outputs[z] (h): receives g * b / a (or b / a when g is null).
// Note that the result is inf / NaN wherever a is 0, because of the division.
// Always reports success.
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
  int x, z;
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
  if (g == 0)
  {
    // No incoming gradient: h = b / a.
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(b, dim);
    ccv_nnc_tensor_view_get_stride(b, bstride);
    for (z = 0; z < output_size; z++)
    {
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
      ccv_nnc_tensor_view_get_stride(a, astride);
      ccv_nnc_tensor_view_get_stride(h, hstride);
      if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
      {
        // None of them is a tensor view: a single flat loop covers every element.
        const int tensor_count = ccv_nnc_tensor_count(b->info);
        for (x = 0; x < tensor_count; x++)
          h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
        continue;
      }
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
      int i[CCV_NNC_MAX_DIM + 2];
      float* const ap = a->data.f32;
      float* const bp = b->data.f32;
      float* const hp = h->data.f32;
      const int count = dim[2] * dim[3];
      // The packed shortcut is only valid when the innermost stride is 1 as well.
      if (astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && hstride[3] == 1)
      {
        // The last two dimensions are packed in every tensor: process them as one run.
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
        {
          float* ap0 = ap + i[0] * astride[0];
          float* bp0 = bp + i[0] * bstride[0];
          float* hp0 = hp + i[0] * hstride[0];
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
          {
            for (x = 0; x < count; x++)
              hp0[x] = bp0[x] / ap0[x];
            ap0 += astride[1];
            bp0 += bstride[1];
            hp0 += hstride[1];
          }
        }
        continue;
      }
      // Fully strided case: honor every stride, including the innermost one.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        float* const ap0 = ap + i[0] * astride[0];
        float* const bp0 = bp + i[0] * bstride[0];
        float* const hp0 = hp + i[0] * hstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          float* ap1 = ap0 + i[1] * astride[1];
          float* bp1 = bp0 + i[1] * bstride[1];
          float* hp1 = hp0 + i[1] * hstride[1];
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
          {
            for (x = 0; x < dim[3]; x++)
              hp1[x * hstride[3]] = bp1[x * bstride[3]] / ap1[x * astride[3]];
            ap1 += astride[2];
            bp1 += bstride[2];
            hp1 += hstride[2];
          }
        }
      }
    }
  } else {
    // Incoming gradient present: h = g * b / a.
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(b, dim);
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
    ccv_nnc_tensor_view_get_stride(b, bstride);
    ccv_nnc_tensor_view_get_stride(g, gstride);
    for (z = 0; z < output_size; z++)
    {
      ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
      ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
      assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
      assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
      assert(ccv_nnc_tensor_view_check_dim(a, dim));
      assert(ccv_nnc_tensor_view_check_dim(h, dim));
      ccv_nnc_tensor_view_get_stride(a, astride);
      ccv_nnc_tensor_view_get_stride(h, hstride);
      if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
      {
        // None of them is a tensor view: a single flat loop covers every element.
        const int tensor_count = ccv_nnc_tensor_count(g->info);
        for (x = 0; x < tensor_count; x++)
          h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
        continue;
      }
      assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
      int i[CCV_NNC_MAX_DIM + 2];
      float* const gp = g->data.f32;
      float* const ap = a->data.f32;
      float* const bp = b->data.f32;
      float* const hp = h->data.f32;
      const int count = dim[2] * dim[3];
      // The packed shortcut is only valid when the innermost stride is 1 as well.
      if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3] && hstride[2] == dim[3] && gstride[3] == 1 && astride[3] == 1 && bstride[3] == 1 && hstride[3] == 1)
      {
        // The last two dimensions are packed in every tensor: process them as one run.
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
        {
          float* gp0 = gp + i[0] * gstride[0];
          float* ap0 = ap + i[0] * astride[0];
          float* bp0 = bp + i[0] * bstride[0];
          float* hp0 = hp + i[0] * hstride[0];
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
          {
            for (x = 0; x < count; x++)
              hp0[x] = gp0[x] * bp0[x] / ap0[x];
            gp0 += gstride[1];
            ap0 += astride[1];
            bp0 += bstride[1];
            hp0 += hstride[1];
          }
        }
        continue;
      }
      // Fully strided case: honor every stride, including the innermost one.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        float* const gp0 = gp + i[0] * gstride[0];
        float* const ap0 = ap + i[0] * astride[0];
        float* const bp0 = bp + i[0] * bstride[0];
        float* const hp0 = hp + i[0] * hstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          float* gp1 = gp0 + i[1] * gstride[1];
          float* ap1 = ap0 + i[1] * astride[1];
          float* bp1 = bp0 + i[1] * bstride[1];
          float* hp1 = hp0 + i[1] * hstride[1];
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
          {
            for (x = 0; x < dim[3]; x++)
              hp1[x * hstride[3]] = gp1[x * gstride[3]] * bp1[x * bstride[3]] / ap1[x * astride[3]];
            gp1 += gstride[2];
            ap1 += astride[2];
            bp1 += bstride[2];
            hp1 += hstride[2];
          }
        }
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
500
501
// Scaled element-wise division over float32 tensors: c = p * a / b.
// - p: scalar multiplier applied to every element.
// - a: numerator tensor; a null pointer is treated as an all-ones tensor, giving c = p / b.
// - b: denominator tensor.
// - c: destination tensor, must have the same dimensions as b (and a, when given).
static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
{
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
  if (a == 0) // Take 0 as all ones tensor.
  {
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(b, dim);
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
    int x;
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
    {
      // Neither is a tensor view: a single flat loop covers every element.
      const int tensor_count = ccv_nnc_tensor_count(b->info);
      for (x = 0; x < tensor_count; x++)
        c->data.f32[x] = p / b->data.f32[x];
      return;
    }
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
    ccv_nnc_tensor_view_get_stride(b, bstride);
    ccv_nnc_tensor_view_get_stride(c, cstride);
    int i[CCV_NNC_MAX_DIM + 2];
    float* const bp = b->data.f32;
    float* const cp = c->data.f32;
    const int count = dim[2] * dim[3];
    // The packed shortcut is only valid when the innermost stride is 1 as well.
    if (bstride[2] == dim[3] && cstride[2] == dim[3] && bstride[3] == 1 && cstride[3] == 1)
    {
      // The last two dimensions are packed in both tensors: process them as one run.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        float* bp0 = bp + i[0] * bstride[0];
        float* cp0 = cp + i[0] * cstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          for (x = 0; x < count; x++)
            cp0[x] = p / bp0[x];
          bp0 += bstride[1];
          cp0 += cstride[1];
        }
      }
      return;
    }
    // Fully strided case: honor every stride, including the innermost one.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      float* const bp0 = bp + i[0] * bstride[0];
      float* const cp0 = cp + i[0] * cstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        float* bp1 = bp0 + i[1] * bstride[1];
        float* cp1 = cp0 + i[1] * cstride[1];
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
        {
          for (x = 0; x < dim[3]; x++)
            cp1[x * cstride[3]] = p / bp1[x * bstride[3]];
          bp1 += bstride[2];
          cp1 += cstride[2];
        }
      }
    }
  } else {
    assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_dim(a, dim);
    assert(ccv_nnc_tensor_view_check_dim(b, dim));
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
    int x;
    if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
    {
      // None of the three is a tensor view: a single flat loop covers every element.
      const int tensor_count = ccv_nnc_tensor_count(a->info);
      for (x = 0; x < tensor_count; x++)
        c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
      return;
    }
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
    ccv_nnc_tensor_view_get_stride(a, astride);
    ccv_nnc_tensor_view_get_stride(b, bstride);
    ccv_nnc_tensor_view_get_stride(c, cstride);
    int i[CCV_NNC_MAX_DIM + 2];
    float* const ap = a->data.f32;
    float* const bp = b->data.f32;
    float* const cp = c->data.f32;
    const int count = dim[2] * dim[3];
    // The packed shortcut is only valid when the innermost stride is 1 as well.
    if (astride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && astride[3] == 1 && bstride[3] == 1 && cstride[3] == 1)
    {
      // The last two dimensions are packed in all three tensors: process them as one run.
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
      {
        float* ap0 = ap + i[0] * astride[0];
        float* bp0 = bp + i[0] * bstride[0];
        float* cp0 = cp + i[0] * cstride[0];
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
        {
          for (x = 0; x < count; x++)
            cp0[x] = p * ap0[x] / bp0[x];
          ap0 += astride[1];
          bp0 += bstride[1];
          cp0 += cstride[1];
        }
      }
      return;
    }
    // Fully strided case: honor every stride, including the innermost one.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      float* const ap0 = ap + i[0] * astride[0];
      float* const bp0 = bp + i[0] * bstride[0];
      float* const cp0 = cp + i[0] * cstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        float* ap1 = ap0 + i[1] * astride[1];
        float* bp1 = bp0 + i[1] * bstride[1];
        float* cp1 = cp0 + i[1] * cstride[1];
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
        {
          for (x = 0; x < dim[3]; x++)
            cp1[x * cstride[3]] = p * ap1[x * astride[3]] / bp1[x * bstride[3]];
          ap1 += astride[2];
          bp1 += bstride[2];
          cp1 += cstride[2];
        }
      }
    }
  }
}
632
633
// Forward entry point for element-wise division: outputs[0] = inputs[0] / inputs[1],
// computed by the CPU reference kernel with a scale of 1. Always reports success.
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  ccv_nnc_tensor_view_t* const numerator = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const denominator = (ccv_nnc_tensor_view_t*)inputs[1];
  ccv_nnc_tensor_view_t* const quotient = (ccv_nnc_tensor_view_t*)outputs[0];
  _ccv_nnc_ewdiv_forw_cpu_ref(1, numerator, denominator, quotient);
  return CCV_NNC_EXEC_SUCCESS;
}
638
639
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
640
17
{
641
  // D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
642
17
  if (output_size == 1 || 
outputs[1] == 016
)
643
2
  {
644
    // When we only need D[x / y, x]
645
2
    _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
646
2
    return CCV_NNC_EXEC_SUCCESS;
647
2
  }
648
15
  int dim[CCV_NNC_MAX_DIM_ALLOC];
649
15
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
650
15
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
651
15
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
652
15
  int hastride[CCV_NNC_MAX_DIM_ALLOC];
653
15
  int hbstride[CCV_NNC_MAX_DIM_ALLOC];
654
15
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
655
15
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
656
15
  ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
657
15
  ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
658
15
  ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
659
15
  if (g == 0)
660
0
  {
661
0
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
662
0
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
663
0
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
664
0
    ccv_nnc_tensor_view_get_dim(b, dim);
665
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
666
0
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
667
0
    if (ha)
668
0
    {
669
0
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
670
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
671
0
    }
672
0
    int x;
673
0
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
674
0
    {
675
      // Super optimal case, just do one for-loop for sum.
676
0
      const int tensor_count = ccv_nnc_tensor_count(b->info);
677
0
      if (ha == 0)
678
0
      {
679
0
        for (x = 0; x < tensor_count; x++)
680
0
        {
681
0
          const float v = 1 / b->data.f32[x];
682
0
          hb->data.f32[x] = -c->data.f32[x] * v;
683
0
        }
684
0
      } else {
685
0
        for (x = 0; x < tensor_count; x++)
686
0
        {
687
0
          const float v = 1 / b->data.f32[x];
688
0
          ha->data.f32[x] = v;
689
0
          hb->data.f32[x] = -c->data.f32[x] * v;
690
0
        }
691
0
      }
692
0
      return CCV_NNC_EXEC_SUCCESS;
693
0
    }
694
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
695
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
696
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
697
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
698
0
    int i[CCV_NNC_MAX_DIM + 2];
699
0
    float* const bp = b->data.f32;
700
0
    float* const cp = c->data.f32;
701
0
    float* const hbp = hb->data.f32;
702
0
    const int count = dim[2] * dim[3];
703
0
    if (ha == 0)
704
0
    {
705
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
706
0
      {
707
        // Special casing if the ainc[3] is the same as dim[3]
708
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
709
0
        {
710
0
          float* bp0 = bp + i[0] * bstride[0];
711
0
          float* cp0 = cp + i[0] * cstride[0];
712
0
          float* hbp0 = hbp + i[0] * hbstride[0];
713
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
714
0
          {
715
0
            for (x = 0; x < count; x++)
716
0
            {
717
0
              const float v = 1 / bp0[x];
718
0
              hbp0[x] = -cp0[x] * v;
719
0
            }
720
0
            bp0 += bstride[1];
721
0
            cp0 += cstride[1];
722
0
            hbp0 += hbstride[1];
723
0
          }
724
0
        }
725
0
        return CCV_NNC_EXEC_SUCCESS;
726
0
      }
727
      // Non-optimal case, need to do skip copy.
728
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
729
0
      {
730
0
        float* const bp0 = bp + i[0] * bstride[0];
731
0
        float* const cp0 = cp + i[0] * cstride[0];
732
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
733
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
734
0
        {
735
0
          float* bp1 = bp0 + i[1] * bstride[1];
736
0
          float* cp1 = cp0 + i[1] * cstride[1];
737
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
738
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
739
0
          {
740
0
            for (x = 0; x < dim[3]; x++)
741
0
            {
742
0
              const float v = 1 / bp1[x];
743
0
              hbp1[x] = -cp1[x] * v;
744
0
            }
745
0
            bp1 += bstride[2];
746
0
            cp1 += cstride[2];
747
0
            hbp1 += hbstride[2];
748
0
          }
749
0
        }
750
0
      }
751
0
    } else {
752
0
      float* const hap = ha->data.f32;
753
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
754
0
      if (bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
755
0
      {
756
        // Special casing if the ainc[3] is the same as dim[3]
757
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
758
0
        {
759
0
          float* bp0 = bp + i[0] * bstride[0];
760
0
          float* cp0 = cp + i[0] * cstride[0];
761
0
          float* hap0 = hap + i[0] * hastride[0];
762
0
          float* hbp0 = hbp + i[0] * hbstride[0];
763
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
764
0
          {
765
0
            for (x = 0; x < count; x++)
766
0
            {
767
0
              const float v = 1 / bp0[x];
768
0
              hap0[x] = v;
769
0
              hbp0[x] = -cp0[x] * v;
770
0
            }
771
0
            bp0 += bstride[1];
772
0
            cp0 += cstride[1];
773
0
            hap0 += hastride[1];
774
0
            hbp0 += hbstride[1];
775
0
          }
776
0
        }
777
0
        return CCV_NNC_EXEC_SUCCESS;
778
0
      }
779
      // Non-optimal case, need to do skip copy.
780
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
781
0
      {
782
0
        float* const bp0 = bp + i[0] * bstride[0];
783
0
        float* const cp0 = cp + i[0] * cstride[0];
784
0
        float* const hap0 = hap + i[0] * hastride[0];
785
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
786
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
787
0
        {
788
0
          float* bp1 = bp0 + i[1] * bstride[1];
789
0
          float* cp1 = cp0 + i[1] * cstride[1];
790
0
          float* hap1 = hap0 + i[1] * hastride[1];
791
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
792
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
793
0
          {
794
0
            for (x = 0; x < dim[3]; x++)
795
0
            {
796
0
              const float v = 1 / bp1[x];
797
0
              hap1[x] = v;
798
0
              hbp1[x] = -cp1[x] * v;
799
0
            }
800
0
            bp1 += bstride[2];
801
0
            cp1 += cstride[2];
802
0
            hap1 += hastride[2];
803
0
            hbp1 += hbstride[2];
804
0
          }
805
0
        }
806
0
      }
807
0
    }
808
15
  } else {
809
15
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
810
15
    assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
811
15
    assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
812
15
    assert(ccv_nnc_tensor_nd(hb->info.dim) <= CCV_NNC_MAX_DIM + 2);
813
15
    ccv_nnc_tensor_view_get_dim(b, dim);
814
15
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
815
15
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
816
15
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
817
15
    if (ha)
818
1
    {
819
1
      assert(ccv_nnc_tensor_nd(ha->info.dim) <= CCV_NNC_MAX_DIM + 2);
820
1
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
821
1
    }
822
15
    int x;
823
15
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || 
!1
CCV_IS_TENSOR_VIEW1
(ha)) && !CCV_IS_TENSOR_VIEW(hb))
824
15
    {
825
      // Super optimal case, just do one for-loop for sum.
826
15
      const int tensor_count = ccv_nnc_tensor_count(g->info);
827
15
      if (ha == 0)
828
14
      {
829
1.50k
        for (x = 0; x < tensor_count; 
x++1.49k
)
830
1.49k
        {
831
1.49k
          const float v = g->data.f32[x] / b->data.f32[x];
832
1.49k
          hb->data.f32[x] = -c->data.f32[x] * v;
833
1.49k
        }
834
14
      } else {
835
2
        for (x = 0; x < tensor_count; 
x++1
)
836
1
        {
837
1
          const float v = g->data.f32[x] / b->data.f32[x];
838
1
          ha->data.f32[x] = v;
839
1
          hb->data.f32[x] = -c->data.f32[x] * v;
840
1
        }
841
1
      }
842
15
      return CCV_NNC_EXEC_SUCCESS;
843
15
    }
844
15
    assert
(CCV_NNC_MAX_DIM == 2)0
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
845
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
846
0
    ccv_nnc_tensor_view_get_stride(b, bstride);
847
0
    ccv_nnc_tensor_view_get_stride(c, cstride);
848
0
    ccv_nnc_tensor_view_get_stride(hb, hbstride);
849
0
    int i[CCV_NNC_MAX_DIM + 2];
850
0
    float* const gp = g->data.f32;
851
0
    float* const bp = b->data.f32;
852
0
    float* const cp = c->data.f32;
853
0
    float* const hbp = hb->data.f32;
854
0
    const int count = dim[2] * dim[3];
855
0
    if (ha == 0)
856
0
    {
857
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hbstride[2] == dim[3])
858
0
      {
859
        // Special casing if the ainc[3] is the same as dim[3]
860
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
861
0
        {
862
0
          float* gp0 = gp + i[0] * gstride[0];
863
0
          float* bp0 = bp + i[0] * bstride[0];
864
0
          float* cp0 = cp + i[0] * cstride[0];
865
0
          float* hbp0 = hbp + i[0] * hbstride[0];
866
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
867
0
          {
868
0
            for (x = 0; x < count; x++)
869
0
            {
870
0
              const float v = gp0[x] / bp0[x];
871
0
              hbp0[x] = -cp0[x] * v;
872
0
            }
873
0
            gp0 += gstride[1];
874
0
            bp0 += bstride[1];
875
0
            cp0 += cstride[1];
876
0
            hbp0 += hbstride[1];
877
0
          }
878
0
        }
879
0
        return CCV_NNC_EXEC_SUCCESS;
880
0
      }
881
      // Non-optimal case, need to do skip copy.
882
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
883
0
      {
884
0
        float* const gp0 = gp + i[0] * gstride[0];
885
0
        float* const bp0 = bp + i[0] * bstride[0];
886
0
        float* const cp0 = cp + i[0] * cstride[0];
887
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
888
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
889
0
        {
890
0
          float* gp1 = gp0 + i[1] * gstride[1];
891
0
          float* bp1 = bp0 + i[1] * bstride[1];
892
0
          float* cp1 = cp0 + i[1] * cstride[1];
893
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
894
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
895
0
          {
896
0
            for (x = 0; x < dim[3]; x++)
897
0
            {
898
0
              const float v = gp1[x] / bp1[x];
899
0
              hbp1[x] = -cp1[x] * v;
900
0
            }
901
0
            gp1 += gstride[2];
902
0
            bp1 += bstride[2];
903
0
            cp1 += cstride[2];
904
0
            hbp1 += hbstride[2];
905
0
          }
906
0
        }
907
0
      }
908
0
    } else {
909
0
      ccv_nnc_tensor_view_get_stride(ha, hastride);
910
0
      float* const hap = ha->data.f32;
911
0
      if (gstride[2] == dim[3] && bstride[2] == dim[3] && cstride[2] == dim[3] && hastride[2] == dim[3] && hbstride[2] == dim[3])
912
0
      {
913
        // Special casing if the ainc[3] is the same as dim[3]
914
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
915
0
        {
916
0
          float* gp0 = gp + i[0] * gstride[0];
917
0
          float* bp0 = bp + i[0] * bstride[0];
918
0
          float* cp0 = cp + i[0] * cstride[0];
919
0
          float* hap0 = hap + i[0] * hastride[0];
920
0
          float* hbp0 = hbp + i[0] * hbstride[0];
921
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
922
0
          {
923
0
            for (x = 0; x < count; x++)
924
0
            {
925
0
              const float v = gp0[x] / bp0[x];
926
0
              hap0[x] = v;
927
0
              hbp0[x] = -cp0[x] * v;
928
0
            }
929
0
            gp0 += gstride[1];
930
0
            bp0 += bstride[1];
931
0
            cp0 += cstride[1];
932
0
            hap0 += hastride[1];
933
0
            hbp0 += hbstride[1];
934
0
          }
935
0
        }
936
0
        return CCV_NNC_EXEC_SUCCESS;
937
0
      }
938
      // Non-optimal case, need to do skip copy.
939
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
940
0
      {
941
0
        float* const gp0 = gp + i[0] * gstride[0];
942
0
        float* const bp0 = bp + i[0] * bstride[0];
943
0
        float* const cp0 = cp + i[0] * cstride[0];
944
0
        float* const hap0 = hap + i[0] * hastride[0];
945
0
        float* const hbp0 = hbp + i[0] * hbstride[0];
946
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
947
0
        {
948
0
          float* gp1 = gp0 + i[1] * gstride[1];
949
0
          float* bp1 = bp0 + i[1] * bstride[1];
950
0
          float* cp1 = cp0 + i[1] * cstride[1];
951
0
          float* hap1 = hap0 + i[1] * hastride[1];
952
0
          float* hbp1 = hbp0 + i[1] * hbstride[1];
953
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
954
0
          {
955
0
            for (x = 0; x < dim[3]; x++)
956
0
            {
957
0
              const float v = gp1[x] / bp1[x];
958
0
              hap1[x] = v;
959
0
              hbp1[x] = -cp1[x] * v;
960
0
            }
961
0
            gp1 += gstride[2];
962
0
            bp1 += bstride[2];
963
0
            cp1 += cstride[2];
964
0
            hap1 += hastride[2];
965
0
            hbp1 += hbstride[2];
966
0
          }
967
0
        }
968
0
      }
969
0
    }
970
0
  }
971
0
  return CCV_NNC_EXEC_SUCCESS;
972
15
}
973
974
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
975
23
{
976
  // Assuming this is float 32.
977
23
  int dim[CCV_NNC_MAX_DIM_ALLOC];
978
23
  int astride[CCV_NNC_MAX_DIM_ALLOC];
979
23
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
980
23
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
981
23
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
982
23
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
983
23
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
984
23
  ccv_nnc_tensor_view_get_dim(a, dim);
985
23
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
986
23
  int x;
987
23
  if (!CCV_IS_TENSOR_VIEW(a) && 
!22
CCV_IS_TENSOR_VIEW22
(b))
988
22
  {
989
    // Super optimal case, just do one for-loop for sum.
990
22
    const int tensor_count = ccv_nnc_tensor_count(a->info);
991
3.09k
    for (x = 0; x < tensor_count; 
x++3.07k
)
992
3.07k
      b->data.f32[x] = exp(a->data.f32[x]);
993
22
    return CCV_NNC_EXEC_SUCCESS;
994
22
  }
995
23
  assert
(CCV_NNC_MAX_DIM == 2)1
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
996
1
  ccv_nnc_tensor_view_get_stride(a, astride);
997
1
  ccv_nnc_tensor_view_get_stride(b, bstride);
998
1
  int i[CCV_NNC_MAX_DIM + 2];
999
1
  float* const ap = a->data.f32;
1000
1
  float* const bp = b->data.f32;
1001
1
  const int count = dim[2] * dim[3];
1002
1
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1003
1
  {
1004
    // Special casing if the ainc[3] is the same as dim[3]
1005
2
    for (i[0] = 0; i[0] < dim[0]; 
i[0]++1
)
1006
1
    {
1007
1
      float* ap0 = ap + i[0] * astride[0];
1008
1
      float* bp0 = bp + i[0] * bstride[0];
1009
2
      for (i[1] = 0; i[1] < dim[1]; 
i[1]++1
)
1010
1
      {
1011
2
        for (x = 0; x < count; 
x++1
)
1012
1
          bp0[x] = exp(ap0[x]);
1013
1
        ap0 += astride[1];
1014
1
        bp0 += bstride[1];
1015
1
      }
1016
1
    }
1017
1
    return CCV_NNC_EXEC_SUCCESS;
1018
1
  }
1019
  // Non-optimal case, need to do skip copy.
1020
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1021
0
  {
1022
0
    float* const ap0 = ap + i[0] * astride[0];
1023
0
    float* const bp0 = bp + i[0] * bstride[0];
1024
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1025
0
    {
1026
0
      float* ap1 = ap0 + i[1] * astride[1];
1027
0
      float* bp1 = bp0 + i[1] * bstride[1];
1028
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1029
0
      {
1030
0
        for (x = 0; x < dim[3]; x++)
1031
0
          bp1[x] = exp(ap1[x]);
1032
0
        ap1 += astride[2];
1033
0
        bp1 += bstride[2];
1034
0
      }
1035
0
    }
1036
0
  }
1037
0
  return CCV_NNC_EXEC_SUCCESS;
1038
1
}
1039
1040
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1041
10
{
1042
  // D[Exp[x], x] = Exp[x]
1043
10
  if (inputs[0] == 0)
1044
0
    _ccv_nnc_tensor_transfer_cpu_ref_f32((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1045
10
  else
1046
10
    _ccv_nnc_ewprod_forw_cpu_ref((ccv_nnc_tensor_view_t*[]){
1047
10
      (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2]
1048
10
    }, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
1049
10
  return CCV_NNC_EXEC_SUCCESS;
1050
10
}
1051
1052
static void _ccv_nnc_ewpow_forw_cpu_ref(ccv_nnc_tensor_view_t* const a, const float exp, ccv_nnc_tensor_view_t* const c)
1053
4
{
1054
  // Assuming this is float 32.
1055
4
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1056
4
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1057
4
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
1058
4
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1059
4
  assert(ccv_nnc_tensor_nd(c->info.dim) <= CCV_NNC_MAX_DIM + 2);
1060
4
  ccv_nnc_tensor_view_get_dim(a, dim);
1061
4
  assert(ccv_nnc_tensor_view_check_dim(c, dim));
1062
4
  int x;
1063
4
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(c))
1064
4
  {
1065
4
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1066
2.00k
    for (x = 0; x < tensor_count; 
x++2.00k
)
1067
2.00k
      c->data.f32[x] = powf(a->data.f32[x], exp);
1068
4
    return;
1069
4
  }
1070
4
  assert
(CCV_NNC_MAX_DIM == 2)0
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1071
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1072
0
  ccv_nnc_tensor_view_get_stride(c, cstride);
1073
0
  int i[CCV_NNC_MAX_DIM + 2];
1074
0
  float* const ap = a->data.f32;
1075
0
  float* const cp = c->data.f32;
1076
0
  const int count = dim[2] * dim[3];
1077
0
  if (astride[2] == dim[3] && cstride[2] == dim[3])
1078
0
  {
1079
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1080
0
    {
1081
0
      float* ap0 = ap + i[0] * astride[0];
1082
0
      float* cp0 = cp + i[0] * cstride[0];
1083
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1084
0
      {
1085
0
        for (x = 0; x < count; x++)
1086
0
          cp0[x] = powf(ap0[x], exp);
1087
0
        ap0 += astride[1];
1088
0
        cp0 += cstride[1];
1089
0
      }
1090
0
    }
1091
0
    return;
1092
0
  }
1093
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1094
0
  {
1095
0
    float* const ap0 = ap + i[0] * astride[0];
1096
0
    float* const cp0 = cp + i[0] * cstride[0];
1097
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1098
0
    {
1099
0
      float* ap1 = ap0 + i[1] * astride[1];
1100
0
      float* cp1 = cp0 + i[1] * cstride[1];
1101
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1102
0
      {
1103
0
        for (x = 0; x < dim[3]; x++)
1104
0
          cp1[x] = powf(ap1[x], exp);
1105
0
        ap1 += astride[2];
1106
0
        cp1 += cstride[2];
1107
0
      }
1108
0
    }
1109
0
  }
1110
0
}
1111
1112
static int _ccv_nnc_ewpow_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1113
4
{
1114
4
  _ccv_nnc_ewpow_forw_cpu_ref((ccv_nnc_tensor_view_t*)inputs[0], cmd.info.pow.exponent, (ccv_nnc_tensor_view_t*)outputs[0]);
1115
4
  return CCV_NNC_EXEC_SUCCESS;
1116
4
}
1117
1118
static void _ccv_nnc_ewpow_back_da_cpu_ref(ccv_nnc_tensor_view_t* const g, ccv_nnc_tensor_view_t* const a, const float exp, ccv_nnc_tensor_view_t* const h)
1119
3
{
1120
  // D[pow(a, exp), a] = exp * pow(a, exp - 1)
1121
3
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1122
3
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
1123
3
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1124
3
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
1125
3
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1126
3
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
1127
3
  ccv_nnc_tensor_view_get_dim(a, dim);
1128
3
  assert(ccv_nnc_tensor_view_check_dim(h, dim));
1129
3
  if (g)
1130
2
  {
1131
2
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
1132
2
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
1133
2
  }
1134
3
  int x;
1135
3
  if ((!g || 
!2
CCV_IS_TENSOR_VIEW2
(g)) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(h))
1136
3
  {
1137
3
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1138
3
    if (g)
1139
2
    {
1140
1.00k
      for (x = 0; x < tensor_count; 
x++1.00k
)
1141
1.00k
        h->data.f32[x] = g->data.f32[x] * exp * powf(a->data.f32[x], exp - 1);
1142
2
    } else {
1143
2
      for (x = 0; x < tensor_count; 
x++1
)
1144
1
        h->data.f32[x] = exp * powf(a->data.f32[x], exp - 1);
1145
1
    }
1146
3
    return;
1147
3
  }
1148
3
  assert
(CCV_NNC_MAX_DIM == 2)0
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1149
0
  if (g)
1150
0
    ccv_nnc_tensor_view_get_stride(g, gstride);
1151
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1152
0
  ccv_nnc_tensor_view_get_stride(h, hstride);
1153
0
  int i[CCV_NNC_MAX_DIM + 2];
1154
0
  float* const gp = g ? g->data.f32 : 0;
1155
0
  float* const ap = a->data.f32;
1156
0
  float* const hp = h->data.f32;
1157
0
  const int count = dim[2] * dim[3];
1158
0
  if ((!g || gstride[2] == dim[3]) && astride[2] == dim[3] && hstride[2] == dim[3])
1159
0
  {
1160
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1161
0
    {
1162
0
      float* gp0 = g ? gp + i[0] * gstride[0] : 0;
1163
0
      float* ap0 = ap + i[0] * astride[0];
1164
0
      float* hp0 = hp + i[0] * hstride[0];
1165
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1166
0
      {
1167
0
        if (g)
1168
0
        {
1169
0
          for (x = 0; x < count; x++)
1170
0
            hp0[x] = gp0[x] * exp * powf(ap0[x], exp - 1);
1171
0
          gp0 += gstride[1];
1172
0
        } else {
1173
0
          for (x = 0; x < count; x++)
1174
0
            hp0[x] = exp * powf(ap0[x], exp - 1);
1175
0
        }
1176
0
        ap0 += astride[1];
1177
0
        hp0 += hstride[1];
1178
0
      }
1179
0
    }
1180
0
    return;
1181
0
  }
1182
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1183
0
  {
1184
0
    float* const gp0 = g ? gp + i[0] * gstride[0] : 0;
1185
0
    float* const ap0 = ap + i[0] * astride[0];
1186
0
    float* const hp0 = hp + i[0] * hstride[0];
1187
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1188
0
    {
1189
0
      float* gp1 = g ? gp0 + i[1] * gstride[1] : 0;
1190
0
      float* ap1 = ap0 + i[1] * astride[1];
1191
0
      float* hp1 = hp0 + i[1] * hstride[1];
1192
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1193
0
      {
1194
0
        if (g)
1195
0
        {
1196
0
          for (x = 0; x < dim[3]; x++)
1197
0
            hp1[x] = gp1[x] * exp * powf(ap1[x], exp - 1);
1198
0
          gp1 += gstride[2];
1199
0
        } else {
1200
0
          for (x = 0; x < dim[3]; x++)
1201
0
            hp1[x] = exp * powf(ap1[x], exp - 1);
1202
0
        }
1203
0
        ap1 += astride[2];
1204
0
        hp1 += hstride[2];
1205
0
      }
1206
0
    }
1207
0
  }
1208
0
}
1209
1210
static int _ccv_nnc_ewpow_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1211
3
{
1212
3
  ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
1213
3
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1];
1214
3
  if (output_size > 0 && outputs[0])
1215
3
    _ccv_nnc_ewpow_back_da_cpu_ref(g, a, cmd.info.pow.exponent, (ccv_nnc_tensor_view_t*)outputs[0]);
1216
3
  return CCV_NNC_EXEC_SUCCESS;
1217
3
}
1218
1219
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1220
261
{
1221
  // Assuming this is float 32.
1222
261
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1223
261
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1224
261
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1225
261
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1226
261
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1227
261
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1228
261
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1229
261
  ccv_nnc_tensor_view_get_dim(a, dim);
1230
261
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1231
261
  int x;
1232
261
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1233
261
  {
1234
    // Super optimal case, just do one for-loop for sum.
1235
261
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1236
3.57k
    for (x = 0; x < tensor_count; 
x++3.31k
)
1237
3.31k
      b->data.f32[x] = log(a->data.f32[x]);
1238
261
    return CCV_NNC_EXEC_SUCCESS;
1239
261
  }
1240
261
  assert
(CCV_NNC_MAX_DIM == 2)0
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1241
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1242
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1243
0
  int i[CCV_NNC_MAX_DIM + 2];
1244
0
  float* const ap = a->data.f32;
1245
0
  float* const bp = b->data.f32;
1246
0
  const int count = dim[2] * dim[3];
1247
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1248
0
  {
1249
    // Special casing if the ainc[3] is the same as dim[3]
1250
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1251
0
    {
1252
0
      float* ap0 = ap + i[0] * astride[0];
1253
0
      float* bp0 = bp + i[0] * bstride[0];
1254
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1255
0
      {
1256
0
        for (x = 0; x < count; x++)
1257
0
          bp0[x] = log(ap0[x]);
1258
0
        ap0 += astride[1];
1259
0
        bp0 += bstride[1];
1260
0
      }
1261
0
    }
1262
0
    return CCV_NNC_EXEC_SUCCESS;
1263
0
  }
1264
  // Non-optimal case, need to do skip copy.
1265
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1266
0
  {
1267
0
    float* const ap0 = ap + i[0] * astride[0];
1268
0
    float* const bp0 = bp + i[0] * bstride[0];
1269
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1270
0
    {
1271
0
      float* ap1 = ap0 + i[1] * astride[1];
1272
0
      float* bp1 = bp0 + i[1] * bstride[1];
1273
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1274
0
      {
1275
0
        for (x = 0; x < dim[3]; x++)
1276
0
          bp1[x] = log(ap1[x]);
1277
0
        ap1 += astride[2];
1278
0
        bp1 += bstride[2];
1279
0
      }
1280
0
    }
1281
0
  }
1282
0
  return CCV_NNC_EXEC_SUCCESS;
1283
0
}
1284
1285
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1286
225
{
1287
  // D[Log[x], x] = 1 / x
1288
225
  _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
1289
225
  return CCV_NNC_EXEC_SUCCESS;
1290
225
}
1291
1292
static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1293
5
{
1294
  // Assuming this is float 32.
1295
5
  int dim[CCV_NNC_MAX_DIM_ALLOC];
1296
5
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1297
5
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1298
5
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
1299
5
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
1300
5
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
1301
5
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
1302
5
  ccv_nnc_tensor_view_get_dim(a, dim);
1303
5
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
1304
5
  int x;
1305
5
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
1306
5
  {
1307
    // Super optimal case, just do one for-loop for sum.
1308
5
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1309
2.01k
    for (x = 0; x < tensor_count; 
x++2.01k
)
1310
2.01k
      b->data.f32[x] = sqrt(a->data.f32[x]);
1311
5
    return CCV_NNC_EXEC_SUCCESS;
1312
5
  }
1313
5
  assert
(CCV_NNC_MAX_DIM == 2)0
; // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1314
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1315
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1316
0
  int i[CCV_NNC_MAX_DIM + 2];
1317
0
  float* const ap = a->data.f32;
1318
0
  float* const bp = b->data.f32;
1319
0
  const int count = dim[2] * dim[3];
1320
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
1321
0
  {
1322
    // Special casing if the ainc[3] is the same as dim[3]
1323
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1324
0
    {
1325
0
      float* ap0 = ap + i[0] * astride[0];
1326
0
      float* bp0 = bp + i[0] * bstride[0];
1327
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1328
0
      {
1329
0
        for (x = 0; x < count; x++)
1330
0
          bp0[x] = sqrt(ap0[x]);
1331
0
        ap0 += astride[1];
1332
0
        bp0 += bstride[1];
1333
0
      }
1334
0
    }
1335
0
    return CCV_NNC_EXEC_SUCCESS;
1336
0
  }
1337
  // Non-optimal case, need to do skip copy.
1338
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
1339
0
  {
1340
0
    float* const ap0 = ap + i[0] * astride[0];
1341
0
    float* const bp0 = bp + i[0] * bstride[0];
1342
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
1343
0
    {
1344
0
      float* ap1 = ap0 + i[1] * astride[1];
1345
0
      float* bp1 = bp0 + i[1] * bstride[1];
1346
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
1347
0
      {
1348
0
        for (x = 0; x < dim[3]; x++)
1349
0
          bp1[x] = sqrt(ap1[x]);
1350
0
        ap1 += astride[2];
1351
0
        bp1 += bstride[2];
1352
0
      }
1353
0
    }
1354
0
  }
1355
0
  return CCV_NNC_EXEC_SUCCESS;
1356
0
}
1357
1358
static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1359
2
{
1360
  // D[Sqrt[x], x] = 0.5 / Sqrt[x]
1361
2
  _ccv_nnc_ewdiv_forw_cpu_ref(0.5, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1362
2
  return CCV_NNC_EXEC_SUCCESS;
1363
2
}
1364
1365
// Element-wise sine: outputs[0][k] = sinf(inputs[0][k]).
// Both tensors are accessed through data.f32 and must have matching
// dimensions (asserted). Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewsin_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  int x;
  if (!(CCV_IS_TENSOR_VIEW(a) || CCV_IS_TENSOR_VIEW(b)))
  {
    // Neither side is flagged as a tensor view: walk both buffers as one flat array.
    const int total = ccv_nnc_tensor_count(a->info);
    float* const src = a->data.f32;
    float* const dst = b->data.f32;
    for (x = 0; x < total; x++)
      dst[x] = sinf(src[x]);
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  float* const abase = a->data.f32;
  float* const bbase = b->data.f32;
  const int count = dim[2] * dim[3];
  int n, c, y;
  if (astride[2] == dim[3] && bstride[2] == dim[3])
  {
    // The two innermost dimensions are packed in both tensors, so each
    // (n, c) slice is processed as a single run of dim[2] * dim[3] floats.
    for (n = 0; n < dim[0]; n++)
      for (c = 0; c < dim[1]; c++)
      {
        float* const aplane = abase + n * astride[0] + c * astride[1];
        float* const bplane = bbase + n * bstride[0] + c * bstride[1];
        for (x = 0; x < count; x++)
          bplane[x] = sinf(aplane[x]);
      }
    return CCV_NNC_EXEC_SUCCESS;
  }
  // General case: step row by row using each tensor's own stride for dimension 2.
  for (n = 0; n < dim[0]; n++)
    for (c = 0; c < dim[1]; c++)
      for (y = 0; y < dim[2]; y++)
      {
        float* const arow = abase + n * astride[0] + c * astride[1] + y * astride[2];
        float* const brow = bbase + n * bstride[0] + c * bstride[1] + y * bstride[2];
        for (x = 0; x < dim[3]; x++)
          brow[x] = sinf(arow[x]);
      }
  return CCV_NNC_EXEC_SUCCESS;
}
1427
1428
// Backward pass of element-wise sine.
// inputs[0] (g) may be NULL; inputs[1] (a) is the tensor whose cosine is taken;
// outputs[0] (h) receives g * cosf(a), or just cosf(a) when g is NULL.
// Dimensions of h (and g, when present) must match a (asserted).
// Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewsin_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // D[Sin[x], x] = Cos[x]
  ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1];
  ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(h, dim));
  if (g)
  {
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
  }
  int x;
  const int g_is_view = g && CCV_IS_TENSOR_VIEW(g);
  if (!g_is_view && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(h))
  {
    // No tensor views involved: treat every buffer as one flat array.
    const int total = ccv_nnc_tensor_count(a->info);
    if (!g)
    {
      for (x = 0; x < total; x++)
        h->data.f32[x] = cosf(a->data.f32[x]);
      return CCV_NNC_EXEC_SUCCESS;
    }
    for (x = 0; x < total; x++)
      h->data.f32[x] = g->data.f32[x] * cosf(a->data.f32[x]);
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
  // gstride is only filled in (and only read) when g is present.
  if (g)
    ccv_nnc_tensor_view_get_stride(g, gstride);
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(h, hstride);
  float* const gbase = g ? g->data.f32 : 0;
  float* const abase = a->data.f32;
  float* const hbase = h->data.f32;
  const int count = dim[2] * dim[3];
  int n, c, y;
  if ((!g || gstride[2] == dim[3]) && astride[2] == dim[3] && hstride[2] == dim[3])
  {
    // Innermost two dimensions are packed everywhere: one run of
    // dim[2] * dim[3] floats per (n, c) slice.
    for (n = 0; n < dim[0]; n++)
      for (c = 0; c < dim[1]; c++)
      {
        float* const aplane = abase + n * astride[0] + c * astride[1];
        float* const hplane = hbase + n * hstride[0] + c * hstride[1];
        if (g)
        {
          float* const gplane = gbase + n * gstride[0] + c * gstride[1];
          for (x = 0; x < count; x++)
            hplane[x] = gplane[x] * cosf(aplane[x]);
        } else {
          for (x = 0; x < count; x++)
            hplane[x] = cosf(aplane[x]);
        }
      }
    return CCV_NNC_EXEC_SUCCESS;
  }
  // General case: address every row through the per-tensor strides.
  for (n = 0; n < dim[0]; n++)
    for (c = 0; c < dim[1]; c++)
      for (y = 0; y < dim[2]; y++)
      {
        float* const arow = abase + n * astride[0] + c * astride[1] + y * astride[2];
        float* const hrow = hbase + n * hstride[0] + c * hstride[1] + y * hstride[2];
        if (g)
        {
          float* const grow = gbase + n * gstride[0] + c * gstride[1] + y * gstride[2];
          for (x = 0; x < dim[3]; x++)
            hrow[x] = grow[x] * cosf(arow[x]);
        } else {
          for (x = 0; x < dim[3]; x++)
            hrow[x] = cosf(arow[x]);
        }
      }
  return CCV_NNC_EXEC_SUCCESS;
}
1523
1524
// Element-wise cosine: outputs[0][k] = cosf(inputs[0][k]).
// Both tensors are accessed through data.f32 and must have matching
// dimensions (asserted). Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewcos_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  int x;
  if (!(CCV_IS_TENSOR_VIEW(a) || CCV_IS_TENSOR_VIEW(b)))
  {
    // Plain tensors on both sides: a single flat pass over all elements.
    const int total = ccv_nnc_tensor_count(a->info);
    for (x = 0; x < total; x++)
      b->data.f32[x] = cosf(a->data.f32[x]);
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  float* const abase = a->data.f32;
  float* const bbase = b->data.f32;
  const int count = dim[2] * dim[3];
  // When dimension 2 is packed in both tensors, each (n, c) slice is handled
  // as one run of dim[2] * dim[3] floats; otherwise it is dim[2] separate
  // rows of dim[3] floats, each located through the dimension-2 stride.
  const int packed = (astride[2] == dim[3] && bstride[2] == dim[3]);
  const int rows = packed ? 1 : dim[2];
  const int cols = packed ? count : dim[3];
  int n, c, y;
  for (n = 0; n < dim[0]; n++)
    for (c = 0; c < dim[1]; c++)
      for (y = 0; y < rows; y++)
      {
        float* const arow = abase + n * astride[0] + c * astride[1] + y * astride[2];
        float* const brow = bbase + n * bstride[0] + c * bstride[1] + y * bstride[2];
        for (x = 0; x < cols; x++)
          brow[x] = cosf(arow[x]);
      }
  return CCV_NNC_EXEC_SUCCESS;
}
1586
1587
// Backward pass of element-wise cosine.
// inputs[0] (g) may be NULL; inputs[1] (a) is the tensor whose sine is taken;
// outputs[0] (h) receives -g * sinf(a), or just -sinf(a) when g is NULL.
// Dimensions of h (and g, when present) must match a (asserted).
// Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewcos_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // D[Cos[x], x] = -Sin[x]
  ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1];
  ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(h, dim));
  if (g)
  {
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
  }
  int x;
  const int g_is_view = g && CCV_IS_TENSOR_VIEW(g);
  if (!g_is_view && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(h))
  {
    // No tensor views involved: treat every buffer as one flat array.
    const int total = ccv_nnc_tensor_count(a->info);
    if (!g)
    {
      for (x = 0; x < total; x++)
        h->data.f32[x] = -sinf(a->data.f32[x]);
      return CCV_NNC_EXEC_SUCCESS;
    }
    for (x = 0; x < total; x++)
      h->data.f32[x] = -g->data.f32[x] * sinf(a->data.f32[x]);
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
  // gstride is only filled in (and only read) when g is present.
  if (g)
    ccv_nnc_tensor_view_get_stride(g, gstride);
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(h, hstride);
  float* const gbase = g ? g->data.f32 : 0;
  float* const abase = a->data.f32;
  float* const hbase = h->data.f32;
  const int count = dim[2] * dim[3];
  // Packed layout: one run of dim[2] * dim[3] floats per (n, c) slice.
  // Otherwise: dim[2] rows of dim[3] floats, located via the dimension-2 strides.
  const int packed = ((!g || gstride[2] == dim[3]) && astride[2] == dim[3] && hstride[2] == dim[3]);
  const int rows = packed ? 1 : dim[2];
  const int cols = packed ? count : dim[3];
  int n, c, y;
  for (n = 0; n < dim[0]; n++)
    for (c = 0; c < dim[1]; c++)
      for (y = 0; y < rows; y++)
      {
        float* const arow = abase + n * astride[0] + c * astride[1] + y * astride[2];
        float* const hrow = hbase + n * hstride[0] + c * hstride[1] + y * hstride[2];
        if (g)
        {
          float* const grow = gbase + n * gstride[0] + c * gstride[1] + y * gstride[2];
          for (x = 0; x < cols; x++)
            hrow[x] = -grow[x] * sinf(arow[x]);
        } else {
          for (x = 0; x < cols; x++)
            hrow[x] = -sinf(arow[x]);
        }
      }
  return CCV_NNC_EXEC_SUCCESS;
}
1682
1683
// Element-wise absolute value: outputs[0][k] = |inputs[0][k]|.
// Both tensors are accessed through data.f32 and must have matching
// dimensions (asserted). Uses the single-precision fabsf so the float data
// is not promoted to double and narrowed back, matching the sinf / cosf
// kernels in this file. Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewabs_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  int x;
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
  {
    // Super optimal case, just do one for-loop over the flat buffers.
    const int tensor_count = ccv_nnc_tensor_count(a->info);
    for (x = 0; x < tensor_count; x++)
      b->data.f32[x] = fabsf(a->data.f32[x]);
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  float* const ap = a->data.f32;
  float* const bp = b->data.f32;
  const int count = dim[2] * dim[3];
  if (astride[2] == dim[3] && bstride[2] == dim[3])
  {
    // Special casing when the stride of dimension 2 equals dim[3] in both
    // tensors: each (i[0], i[1]) slice is one run of dim[2] * dim[3] floats.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      float* ap0 = ap + i[0] * astride[0];
      float* bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        for (x = 0; x < count; x++)
          bp0[x] = fabsf(ap0[x]);
        ap0 += astride[1];
        bp0 += bstride[1];
      }
    }
    return CCV_NNC_EXEC_SUCCESS;
  }
  // Non-optimal case, need to step row by row using the dimension-2 strides.
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
  {
    float* const ap0 = ap + i[0] * astride[0];
    float* const bp0 = bp + i[0] * bstride[0];
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
    {
      float* ap1 = ap0 + i[1] * astride[1];
      float* bp1 = bp0 + i[1] * bstride[1];
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
      {
        for (x = 0; x < dim[3]; x++)
          bp1[x] = fabsf(ap1[x]);
        ap1 += astride[2];
        bp1 += bstride[2];
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
1748
1749
// Backward pass of element-wise absolute value.
// inputs[0] (g) is the incoming gradient, inputs[1] (a) is the tensor whose
// sign is tested, outputs[0] (b) receives g where a >= 0 and -g otherwise.
// All three tensors are accessed through data.f32 and must share the same
// dimensions (asserted). Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_ewabs_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[1];
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(g, dim));
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  int x;
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(g))
  {
    // Super optimal case, just do one for-loop over the flat buffers.
    const int tensor_count = ccv_nnc_tensor_count(a->info);
    for (x = 0; x < tensor_count; x++)
      b->data.f32[x] = a->data.f32[x] >= 0 ? g->data.f32[x] : -g->data.f32[x];
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  // Each tensor gets its own stride array. g's strides must land in gstride:
  // writing them to astride (which is overwritten on the next line) would
  // leave gstride uninitialized for every strided access below.
  ccv_nnc_tensor_view_get_stride(g, gstride);
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  int i[CCV_NNC_MAX_DIM + 2];
  float* const gp = g->data.f32;
  float* const ap = a->data.f32;
  float* const bp = b->data.f32;
  const int count = dim[2] * dim[3];
  // The packed shortcut indexes g, a and b with the same flat offset inside a
  // slice, so it is only valid when all three have dimension 2 packed.
  if (gstride[2] == dim[3] && astride[2] == dim[3] && bstride[2] == dim[3])
  {
    // Each (i[0], i[1]) slice is one run of dim[2] * dim[3] floats.
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
    {
      float* gp0 = gp + i[0] * gstride[0];
      float* ap0 = ap + i[0] * astride[0];
      float* bp0 = bp + i[0] * bstride[0];
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
      {
        for (x = 0; x < count; x++)
          bp0[x] = ap0[x] >= 0 ? gp0[x] : -gp0[x];
        gp0 += gstride[1];
        ap0 += astride[1];
        bp0 += bstride[1];
      }
    }
    return CCV_NNC_EXEC_SUCCESS;
  }
  // Non-optimal case, need to step row by row using the dimension-2 strides.
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
  {
    float* const gp0 = gp + i[0] * gstride[0];
    float* const ap0 = ap + i[0] * astride[0];
    float* const bp0 = bp + i[0] * bstride[0];
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
    {
      float* gp1 = gp0 + i[1] * gstride[1];
      float* ap1 = ap0 + i[1] * astride[1];
      float* bp1 = bp0 + i[1] * bstride[1];
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
      {
        for (x = 0; x < dim[3]; x++)
          bp1[x] = ap1[x] >= 0 ? gp1[x] : -gp1[x];
        gp1 += gstride[2];
        ap1 += astride[2];
        bp1 += bstride[2];
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
1825
1826
// Element-wise clamp of inputs[0] into outputs[0] using the bounds stored in
// cmd.info.clamp. A NaN bound is treated as "not set":
//   - min is NaN: only the upper bound is applied (ccv_min(value, max));
//   - otherwise, max is NaN: only the lower bound is applied (ccv_max(value, min));
//   - otherwise both are applied (ccv_clamp(value, min, max)).
// At least one bound must be non-NaN (asserted). Both tensors are accessed
// through data.f32 and must have matching dimensions (asserted).
// Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_clamp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  // Assuming this is float 32.
  ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[0];
  ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[0];
  assert(ccv_nnc_tensor_nd(a->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_dim(a, dim);
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  int x;
  const float min = cmd.info.clamp.min;
  const float max = cmd.info.clamp.max;
  assert(!isnan(min) || !isnan(max));
  // The min check takes precedence, mirroring an if / else-if / else chain.
  const int upper_only = isnan(min);
  const int lower_only = !upper_only && isnan(max);
  if (!(CCV_IS_TENSOR_VIEW(a) || CCV_IS_TENSOR_VIEW(b)))
  {
    // Plain tensors on both sides: a single flat pass over all elements.
    const int total = ccv_nnc_tensor_count(a->info);
    if (upper_only)
    {
      for (x = 0; x < total; x++)
        b->data.f32[x] = ccv_min(a->data.f32[x], max);
    } else if (lower_only) {
      for (x = 0; x < total; x++)
        b->data.f32[x] = ccv_max(a->data.f32[x], min);
    } else {
      for (x = 0; x < total; x++)
        b->data.f32[x] = ccv_clamp(a->data.f32[x], min, max);
    }
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  int astride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  ccv_nnc_tensor_view_get_stride(a, astride);
  ccv_nnc_tensor_view_get_stride(b, bstride);
  float* const abase = a->data.f32;
  float* const bbase = b->data.f32;
  const int count = dim[2] * dim[3];
  // When dimension 2 is packed in both tensors, each (n, c) slice is handled
  // as one run of dim[2] * dim[3] floats; otherwise it is dim[2] separate
  // rows of dim[3] floats, each located through the dimension-2 stride.
  const int packed = (astride[2] == dim[3] && bstride[2] == dim[3]);
  const int rows = packed ? 1 : dim[2];
  const int cols = packed ? count : dim[3];
  int n, c, y;
  for (n = 0; n < dim[0]; n++)
    for (c = 0; c < dim[1]; c++)
      for (y = 0; y < rows; y++)
      {
        float* const arow = abase + n * astride[0] + c * astride[1] + y * astride[2];
        float* const brow = bbase + n * bstride[0] + c * bstride[1] + y * bstride[2];
        if (upper_only)
        {
          for (x = 0; x < cols; x++)
            brow[x] = ccv_min(arow[x], max);
        } else if (lower_only) {
          for (x = 0; x < cols; x++)
            brow[x] = ccv_max(arow[x], min);
        } else {
          for (x = 0; x < cols; x++)
            brow[x] = ccv_clamp(arow[x], min, max);
        }
      }
  return CCV_NNC_EXEC_SUCCESS;
}
1978
1979
// Decides whether the gradient at one element survives the clamp: returns 0 when the
// forward output v sits at or beyond an active bound, 1 otherwise. A bound is active
// when its has_* flag is non-zero; an inactive bound is never compared against.
static inline int _ccv_nnc_clamp_back_keep(const float v, const float min, const float max, const int has_min, const int has_max)
{
  if (has_max && v >= max)
    return 0;
  if (has_min && v <= min)
    return 0;
  return 1;
}

// Backward pass of clamp for the CPU reference backend.
//
// inputs[0] (g) is the incoming gradient and may be NULL, in which case it is treated
// as all ones. inputs[2] (b) is read as the forward output. outputs[0] (h) receives the
// result: h[x] = 0 where b[x] is at or beyond an active bound, otherwise g[x] (or 1
// when g is NULL). cmd.info.clamp.min / max set to NaN disable the respective bound;
// at least one of them must be a real number (asserted).
//
// Always returns CCV_NNC_EXEC_SUCCESS.
static int _ccv_nnc_clamp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
  assert(input_size == 3);
  const ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0]; // gradient, optional
  const ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
  assert(output_size == 1);
  ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[0];
  // Assuming this is float 32.
  int dim[CCV_NNC_MAX_DIM_ALLOC];
  int gstride[CCV_NNC_MAX_DIM_ALLOC];
  int hstride[CCV_NNC_MAX_DIM_ALLOC];
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
  assert(ccv_nnc_tensor_nd(h->info.dim) <= CCV_NNC_MAX_DIM + 2);
  assert(ccv_nnc_tensor_nd(b->info.dim) <= CCV_NNC_MAX_DIM + 2);
  // The iteration shape comes from h. g must not be touched here: it is allowed to be
  // NULL and is only dereferenced behind the checks below.
  ccv_nnc_tensor_view_get_dim(h, dim);
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
  const float min = cmd.info.clamp.min;
  const float max = cmd.info.clamp.max;
  assert(!isnan(min) || !isnan(max));
  // Resolve once which bounds participate, so the per-element test never compares
  // against a NaN bound.
  const int has_min = !isnan(min);
  const int has_max = !isnan(max);
  int x;
  if ((!g || !CCV_IS_TENSOR_VIEW(g)) && !CCV_IS_TENSOR_VIEW(h) && !CCV_IS_TENSOR_VIEW(b))
  {
    // Super optimal case, no tensor is a view, so one flat loop covers everything.
    // The element count is taken from g when it is present, otherwise from h.
    int tensor_count;
    if (g)
      tensor_count = ccv_nnc_tensor_count(g->info);
    else
      tensor_count = ccv_nnc_tensor_count(h->info);
    const float* const gflat = g ? g->data.f32 : 0;
    const float* const bflat = b->data.f32;
    float* const hflat = h->data.f32;
    for (x = 0; x < tensor_count; x++)
      hflat[x] = _ccv_nnc_clamp_back_keep(bflat[x], min, max, has_min, has_max) ? (gflat ? gflat[x] : 1) : 0;
    return CCV_NNC_EXEC_SUCCESS;
  }
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
  if (g)
  {
    assert(ccv_nnc_tensor_nd(g->info.dim) <= CCV_NNC_MAX_DIM + 2);
    ccv_nnc_tensor_view_get_stride(g, gstride);
  }
  ccv_nnc_tensor_view_get_stride(b, bstride);
  ccv_nnc_tensor_view_get_stride(h, hstride);
  int i[CCV_NNC_MAX_DIM + 2];
  const float* const gp = g ? g->data.f32 : 0; // stays 0 when there is no gradient; gstride is then never read
  const float* const bp = b->data.f32;
  float* const hp = h->data.f32;
  // At least one tensor is a view: walk the outer three dimensions through each
  // tensor's own strides and treat the innermost dimension as contiguous.
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
  {
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
    {
      const float* gp1 = gp ? gp + i[0] * gstride[0] + i[1] * gstride[1] : 0;
      const float* bp1 = bp + i[0] * bstride[0] + i[1] * bstride[1];
      float* hp1 = hp + i[0] * hstride[0] + i[1] * hstride[1];
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
      {
        for (x = 0; x < dim[3]; x++)
          hp1[x] = _ccv_nnc_clamp_back_keep(bp1[x], min, max, has_min, has_max) ? (gp1 ? gp1[x] : 1) : 0;
        if (gp1)
          gp1 += gstride[2];
        bp1 += bstride[2];
        hp1 += hstride[2];
      }
    }
  }
  return CCV_NNC_EXEC_SUCCESS;
}
2298
2299
// CPU reference backend entry for CCV_NNC_EWSUM_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsum_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2307
2308
// CPU reference backend entry for CCV_NNC_EWSUM_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsum_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2316
2317
// CPU reference backend entry for CCV_NNC_EWPROD_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewprod_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2325
2326
// CPU reference backend entry for CCV_NNC_EWPROD_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewprod_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2334
2335
// CPU reference backend entry for CCV_NNC_EWDIV_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewdiv_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2343
2344
// CPU reference backend entry for CCV_NNC_EWDIV_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewdiv_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2352
2353
// CPU reference backend entry for CCV_NNC_EWEXP_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewexp_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2361
2362
// CPU reference backend entry for CCV_NNC_EWEXP_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewexp_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2370
2371
// CPU reference backend entry for CCV_NNC_EWPOW_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPOW_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewpow_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2379
2380
// CPU reference backend entry for CCV_NNC_EWPOW_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPOW_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewpow_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2388
2389
// CPU reference backend entry for CCV_NNC_EWLOG_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewlog_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2397
2398
// CPU reference backend entry for CCV_NNC_EWLOG_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewlog_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2406
2407
// CPU reference backend entry for CCV_NNC_EWSQRT_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsqrt_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2415
2416
// CPU reference backend entry for CCV_NNC_EWSQRT_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsqrt_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2424
2425
// CPU reference backend entry for CCV_NNC_EWSIN_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSIN_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsin_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2433
2434
// CPU reference backend entry for CCV_NNC_EWSIN_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSIN_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewsin_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2442
2443
// CPU reference backend entry for CCV_NNC_EWCOS_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWCOS_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewcos_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2451
2452
// CPU reference backend entry for CCV_NNC_EWCOS_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWCOS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewcos_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2460
2461
// CPU reference backend entry for CCV_NNC_EWABS_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewabs_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2469
2470
// CPU reference backend entry for CCV_NNC_EWABS_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWABS_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_ewabs_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2478
2479
// CPU reference backend entry for CCV_NNC_CLAMP_FORWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_clamp_forw;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}
2487
2488
// CPU reference backend entry for CCV_NNC_CLAMP_BACKWARD: fills in the registry with the
// executor plus the layouts, datatype (CCV_32F) and memory type (CPU) it accepts.
REGISTER_COMMAND_BACKEND(CCV_NNC_CLAMP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
  // Executor and the number of algorithms on offer.
  registry->exec = _ccv_nnc_clamp_back;
  registry->algorithms = 1;
  // Tensor constraints.
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
  registry->tensor_datatypes = CCV_32F;
  registry->tensor_formats = CCV_TENSOR_FORMAT_CHWN | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC;
}