Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include <ccv.h>
2
#include <ccv_internal.h>
3
#include <nnc/ccv_nnc.h>
4
#include <nnc/ccv_nnc_easy.h>
5
#include <nnc/ccv_nnc_internal.h>
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
/**
 * Element-wise sum forward (CPU reference): outputs[0] = inputs[0] + inputs[1] + ... .
 * Supports in-place operation where outputs[0] aliases one of the inputs, and
 * tensor views with non-contiguous strides (CCV_NNC_MAX_DIM == 2 layout assumed).
 *
 * BUG FIX: in the "inc[3] == dim[3]" fast path, the end-of-row advance for ap
 * used `ainc[0]` instead of `ainc[3]` (compare the bp/cp lines right below it,
 * and the identical loop in _ccv_nnc_ewprod_forw_cpu_ref). That path had zero
 * coverage, which is why the typo went unnoticed. Corrected to `ainc[3]`.
 */
void _ccv_nnc_ewsum_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
	if (input_size == 1 && output_size == 1)
	{
		// Degenerate sum of one tensor: plain copy.
		_ccv_nnc_tensor_transfer_cpu_ref(inputs[0], outputs[0]);
		return;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	int k = 0;
	// This can be an in-place operation: find whether any input shares the output's
	// buffer, and if so start accumulation from that one (so it is read before it
	// gets overwritten on the first pass).
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	// Accumulate pairwise: first pass sums inputs[k] with one other input into c,
	// subsequent passes add the remaining inputs onto c. The `z >= k` selection
	// skips over index k, which was consumed on the first pass.
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
		ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: all dense tensors, one flat loop over every element.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the inc[3] is the same as dim[3]: the last two
			// dimensions are contiguous, so sum them in a single inner loop.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				// FIXED: was `ainc[0]` — must skip whole rows like bp/cp below.
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy: walk all four dimensions,
		// advancing each pointer by its own stride at every level.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
110
111
/* Command entry point for element-wise sum forward: adapts the generic tensor
 * arguments to views and delegates to the CPU reference kernel. */
static int _ccv_nnc_ewsum_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	ccv_nnc_tensor_view_t* const* const input_views = (ccv_nnc_tensor_view_t**)inputs;
	ccv_nnc_tensor_view_t* const* const output_views = (ccv_nnc_tensor_view_t**)outputs;
	_ccv_nnc_ewsum_forw_cpu_ref(input_views, input_size, output_views, output_size);
	return CCV_NNC_EXEC_SUCCESS;
}
116
117
/* Backward pass for element-wise sum. Since D[x + y + z, x] = 1, each requested
 * output gradient is either all ones (no incoming gradient supplied) or a copy
 * of the incoming gradient (skipped when they already share storage). */
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	ccv_nnc_tensor_t* const g = inputs[0];
	int k;
	for (k = 0; k < output_size; k++)
	{
		if (!outputs[k])
			continue;
		if (g == 0)
			// No incoming gradient: treat it as 1.
			_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[k], 1);
		else if (g->data.f32 != outputs[k]->data.f32)
			// Copy over the gradient unless output already aliases it.
			_ccv_nnc_tensor_transfer_cpu_ref((ccv_nnc_tensor_view_t*)g, (ccv_nnc_tensor_view_t*)outputs[k]);
	}
	return CCV_NNC_EXEC_SUCCESS;
}
135
136
/**
 * Element-wise product forward (CPU reference): outputs[0] = inputs[0] * inputs[1] * ... .
 * Mirrors _ccv_nnc_ewsum_forw_cpu_ref, with multiplication in place of addition.
 * Supports in-place operation (output aliasing one input) and strided tensor views.
 */
void _ccv_nnc_ewprod_forw_cpu_ref(ccv_nnc_tensor_view_t* const* const inputs, const int input_size, ccv_nnc_tensor_view_t* const* const outputs, const int output_size)
{
	if (input_size == 1 && output_size == 1)
	{
		// Product of a single tensor: plain copy.
		_ccv_nnc_tensor_transfer_cpu_ref(inputs[0], outputs[0]);
		return;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	int k = 0;
	// In-place support: find the input (if any) that shares the output's buffer
	// so it is consumed on the first accumulation pass, before being overwritten.
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	// Accumulate pairwise into c; `z >= k` steps around index k which was
	// already used as the starting operand.
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = outputs[0];
		ccv_nnc_tensor_view_t* a = z > 0 ? c : inputs[k];
		ccv_nnc_tensor_view_t* b = z >= k ? inputs[z + 1] : inputs[z];
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: all dense, one flat loop.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Last two dimensions are contiguous: multiply them in one inner loop.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy: walk all four dimensions with
		// per-tensor strides.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
231
232
/* Command entry point for element-wise product forward: adapts generic tensors
 * to views and delegates to the CPU reference kernel. */
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	ccv_nnc_tensor_view_t* const* const input_views = (ccv_nnc_tensor_view_t**)inputs;
	ccv_nnc_tensor_view_t* const* const output_views = (ccv_nnc_tensor_view_t**)outputs;
	_ccv_nnc_ewprod_forw_cpu_ref(input_views, input_size, output_views, output_size);
	return CCV_NNC_EXEC_SUCCESS;
}
237
238
/**
 * Backward pass for element-wise product. With b = prod of all inputs and
 * a = the input being differentiated, D[prod, a] = b / a, so each output
 * gradient is h = g * b / a (or b / a when the incoming gradient g is absent,
 * i.e. implicitly all ones).
 *
 * inputs layout (established by the indexing below): inputs[0] = g (incoming
 * gradient, may be 0), inputs[1..output_size] = the forward inputs,
 * inputs[output_size + 1] = b (the forward output, the full product).
 *
 * NOTE(review): this recovers each cofactor by division, so it presumably
 * assumes no forward input element is zero — confirm against callers.
 */
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[x * y * z, x] = y * z
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ginc[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int hinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
	if (g == 0)
	{
		// No incoming gradient: h = b / a for each differentiated input a.
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_get_inc(b, binc);
		for (z = 0; z < output_size; z++)
		{
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(ccv_nnc_tensor_view_check_dim(a, dim));
			assert(ccv_nnc_tensor_view_check_dim(h, dim));
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case: all dense, one flat loop.
				const int tensor_count = ccv_nnc_tensor_count(b->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Last two dimensions contiguous: one inner loop per plane.
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	} else {
		// Incoming gradient present: h = g * b / a for each differentiated input.
		assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		assert(ccv_nnc_tensor_view_check_dim(g, dim));
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(g, ginc);
		for (z = 0; z < output_size; z++)
		{
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(ccv_nnc_tensor_view_check_dim(a, dim));
			assert(ccv_nnc_tensor_view_check_dim(h, dim));
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case: all dense, one flat loop.
				const int tensor_count = ccv_nnc_tensor_count(g->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* gp = g->data.f32;
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ginc[3] == dim[3] && ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Last two dimensions contiguous: one inner loop per plane.
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[2] * ginc[3];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[3];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					gp += (ginc[2] - dim[2]) * ginc[3];
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
401
402
/**
 * Element-wise division (CPU reference): c = p * a / b, or c = p / b when a is
 * NULL (a NULL numerator is treated as an all-ones tensor). The scalar p lets
 * the backward pass reuse this kernel (e.g. with p = 1). Handles dense tensors
 * with one flat loop, and strided views with CCV_NNC_MAX_DIM == 2 nested loops.
 */
static void _ccv_nnc_ewdiv_forw_cpu_ref(const float p, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
{
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	if (a == 0) // Take 0 as all ones tensor.
	{
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		int x;
		if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: both dense, one flat loop computing p / b.
			const int tensor_count = ccv_nnc_tensor_count(b->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = p / b->data.f32[x];
			return;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Last two dimensions contiguous: one inner loop per plane.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = p / bp[x];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = p / bp[x];
					bp += binc[3];
					cp += cinc[3];
				}
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	} else {
		// General case: c = p * a / b, all three shapes must agree.
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		assert(ccv_nnc_tensor_view_check_dim(b, dim));
		assert(ccv_nnc_tensor_view_check_dim(c, dim));
		int x;
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: all dense, one flat loop.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = p * a->data.f32[x] / b->data.f32[x];
			return;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Last two dimensions contiguous: one inner loop per plane.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = p * ap[x] / bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = p * ap[x] / bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
}
533
534
/* Command entry point for element-wise division forward:
 * outputs[0] = inputs[0] / inputs[1], i.e. the reference kernel with p = 1. */
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	ccv_nnc_tensor_view_t* const numerator = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const denominator = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const quotient = (ccv_nnc_tensor_view_t*)outputs[0];
	_ccv_nnc_ewdiv_forw_cpu_ref(1, numerator, denominator, quotient);
	return CCV_NNC_EXEC_SUCCESS;
}
539
540
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
541
4
{
542
4
  // D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2
543
4
  if (output_size == 1 || outputs[1] == 0)
544
1
  {
545
1
    // When we only need D[x / y, y]
546
1
    _ccv_nnc_ewdiv_forw_cpu_ref(1, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
547
1
    return CCV_NNC_EXEC_SUCCESS;
548
1
  }
549
3
  int dim[CCV_NNC_MAX_DIM + 2];
550
3
  int ginc[CCV_NNC_MAX_DIM + 2];
551
3
  int binc[CCV_NNC_MAX_DIM + 2];
552
3
  int cinc[CCV_NNC_MAX_DIM + 2];
553
3
  int hainc[CCV_NNC_MAX_DIM + 2];
554
3
  int hbinc[CCV_NNC_MAX_DIM + 2];
555
3
  ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
556
3
  ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
557
3
  ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
558
3
  ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
559
3
  ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
560
3
  if (g == 0)
561
0
  {
562
0
    assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
563
0
    assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
564
0
    assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
565
0
    ccv_nnc_tensor_view_get_dim(b, dim);
566
0
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
567
0
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
568
0
    if (ha)
569
0
    {
570
0
      assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
571
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
572
0
    }
573
0
    int x;
574
0
    if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || !CCV_IS_TENSOR_VIEW(ha)) && !CCV_IS_TENSOR_VIEW(hb))
575
0
    {
576
0
      // Super optimal case, just do one for-loop for sum.
577
0
      const int tensor_count = ccv_nnc_tensor_count(b->info);
578
0
      if (ha == 0)
579
0
      {
580
0
        for (x = 0; x < tensor_count; x++)
581
0
        {
582
0
          const float v = 1 / b->data.f32[x];
583
0
          hb->data.f32[x] = -c->data.f32[x] * v;
584
0
        }
585
0
      } else {
586
0
        for (x = 0; x < tensor_count; x++)
587
0
        {
588
0
          const float v = 1 / b->data.f32[x];
589
0
          ha->data.f32[x] = v;
590
0
          hb->data.f32[x] = -c->data.f32[x] * v;
591
0
        }
592
0
      }
593
0
      return CCV_NNC_EXEC_SUCCESS;
594
0
    }
595
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
596
0
    ccv_nnc_tensor_view_get_inc(b, binc);
597
0
    ccv_nnc_tensor_view_get_inc(c, cinc);
598
0
    ccv_nnc_tensor_view_get_inc(hb, hbinc);
599
0
    int i[CCV_NNC_MAX_DIM + 2];
600
0
    float* bp = b->data.f32;
601
0
    float* cp = c->data.f32;
602
0
    float* hbp = hb->data.f32;
603
0
    const int count = dim[2] * dim[3];
604
0
    if (ha == 0)
605
0
    {
606
0
      if (binc[3] == dim[3] && cinc[3] == dim[3] && hbinc[3] == dim[3])
607
0
      {
608
0
        // Special casing if the ainc[3] is the same as dim[3]
609
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
610
0
        {
611
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
612
0
          {
613
0
            for (x = 0; x < count; x++)
614
0
            {
615
0
              const float v = 1 / bp[x];
616
0
              hbp[x] = -cp[x] * v;
617
0
            }
618
0
            bp += binc[2] * binc[3];
619
0
            cp += cinc[2] * cinc[3];
620
0
            hbp += hbinc[2] * hbinc[3];
621
0
          }
622
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
623
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
624
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
625
0
        }
626
0
        return CCV_NNC_EXEC_SUCCESS;
627
0
      }
628
0
      // Non-optimal case, need to do skip copy.
629
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
630
0
      {
631
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
632
0
        {
633
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
634
0
          {
635
0
            for (x = 0; x < dim[3]; x++)
636
0
            {
637
0
              const float v = 1 / bp[x];
638
0
              hbp[x] = -cp[x] * v;
639
0
            }
640
0
            bp += binc[3];
641
0
            cp += cinc[3];
642
0
            hbp += hbinc[3];
643
0
          }
644
0
          bp += (binc[2] - dim[2]) * binc[3];
645
0
          cp += (cinc[2] - dim[2]) * cinc[3];
646
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
647
0
        }
648
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
649
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
650
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
651
0
      }
652
0
    } else {
653
0
      float* hap = ha->data.f32;
654
0
      ccv_nnc_tensor_view_get_inc(ha, hainc);
655
0
      if (binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
656
0
      {
657
0
        // Special casing if the ainc[3] is the same as dim[3]
658
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
659
0
        {
660
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
661
0
          {
662
0
            for (x = 0; x < count; x++)
663
0
            {
664
0
              const float v = 1 / bp[x];
665
0
              hap[x] = v;
666
0
              hbp[x] = -cp[x] * v;
667
0
            }
668
0
            bp += binc[2] * binc[3];
669
0
            cp += cinc[2] * cinc[3];
670
0
            hap += hainc[2] * hainc[3];
671
0
            hbp += hbinc[2] * hbinc[3];
672
0
          }
673
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
674
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
675
0
          hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
676
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
677
0
        }
678
0
        return CCV_NNC_EXEC_SUCCESS;
679
0
      }
680
0
      // Non-optimal case, need to do skip copy.
681
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
682
0
      {
683
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
684
0
        {
685
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
686
0
          {
687
0
            for (x = 0; x < dim[3]; x++)
688
0
            {
689
0
              const float v = 1 / bp[x];
690
0
              hap[x] = v;
691
0
              hbp[x] = -cp[x] * v;
692
0
            }
693
0
            bp += binc[3];
694
0
            cp += cinc[3];
695
0
            hap += hainc[3];
696
0
            hbp += hbinc[3];
697
0
          }
698
0
          bp += (binc[2] - dim[2]) * binc[3];
699
0
          cp += (cinc[2] - dim[2]) * cinc[3];
700
0
          hap += (hainc[2] - dim[2]) * hainc[3];
701
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
702
0
        }
703
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
704
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
705
0
        hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
706
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
707
0
      }
708
0
    }
709
3
  } else {
710
3
    assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
711
3
    assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
712
3
    assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
713
3
    assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
714
3
    ccv_nnc_tensor_view_get_dim(b, dim);
715
3
    assert(ccv_nnc_tensor_view_check_dim(g, dim));
716
3
    assert(ccv_nnc_tensor_view_check_dim(c, dim));
717
3
    assert(ccv_nnc_tensor_view_check_dim(hb, dim));
718
3
    if (ha)
719
0
    {
720
0
      assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
721
0
      assert(ccv_nnc_tensor_view_check_dim(ha, dim));
722
0
    }
723
3
    int x;
724
3
    if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && (ha == 0 || 
!0
CCV_IS_TENSOR_VIEW0
(ha)) && !CCV_IS_TENSOR_VIEW(hb))
725
3
    {
726
3
      // Super optimal case, just do one for-loop for sum.
727
3
      const int tensor_count = ccv_nnc_tensor_count(g->info);
728
3
      if (ha == 0)
729
3
      {
730
15
        for (x = 0; x < tensor_count; 
x++12
)
731
12
        {
732
12
          const float v = g->data.f32[x] / b->data.f32[x];
733
12
          hb->data.f32[x] = -c->data.f32[x] * v;
734
12
        }
735
3
      } else {
736
0
        for (x = 0; x < tensor_count; x++)
737
0
        {
738
0
          const float v = g->data.f32[x] / b->data.f32[x];
739
0
          ha->data.f32[x] = v;
740
0
          hb->data.f32[x] = -c->data.f32[x] * v;
741
0
        }
742
0
      }
743
3
      return CCV_NNC_EXEC_SUCCESS;
744
3
    }
745
0
    assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
746
0
    ccv_nnc_tensor_view_get_inc(g, ginc);
747
0
    ccv_nnc_tensor_view_get_inc(b, binc);
748
0
    ccv_nnc_tensor_view_get_inc(c, cinc);
749
0
    ccv_nnc_tensor_view_get_inc(hb, hbinc);
750
0
    int i[CCV_NNC_MAX_DIM + 2];
751
0
    float* gp = g->data.f32;
752
0
    float* bp = b->data.f32;
753
0
    float* cp = c->data.f32;
754
0
    float* hbp = hb->data.f32;
755
0
    const int count = dim[2] * dim[3];
756
0
    if (ha == 0)
757
0
    {
758
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hbinc[3] == dim[3])
759
0
      {
760
0
        // Special casing if the ainc[3] is the same as dim[3]
761
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
762
0
        {
763
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
764
0
          {
765
0
            for (x = 0; x < count; x++)
766
0
            {
767
0
              const float v = gp[x] / bp[x];
768
0
              hbp[x] = -cp[x] * v;
769
0
            }
770
0
            gp += ginc[2] * ginc[3];
771
0
            bp += binc[2] * binc[3];
772
0
            cp += cinc[2] * cinc[3];
773
0
            hbp += hbinc[2] * hbinc[3];
774
0
          }
775
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
776
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
777
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
778
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
779
0
        }
780
0
        return CCV_NNC_EXEC_SUCCESS;
781
0
      }
782
0
      // Non-optimal case, need to do skip copy.
783
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
784
0
      {
785
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
786
0
        {
787
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
788
0
          {
789
0
            for (x = 0; x < dim[3]; x++)
790
0
            {
791
0
              const float v = gp[x] / bp[x];
792
0
              hbp[x] = -cp[x] * v;
793
0
            }
794
0
            gp += ginc[3];
795
0
            bp += binc[3];
796
0
            cp += cinc[3];
797
0
            hbp += hbinc[3];
798
0
          }
799
0
          gp += (ginc[2] - dim[2]) * ginc[3];
800
0
          bp += (binc[2] - dim[2]) * binc[3];
801
0
          cp += (cinc[2] - dim[2]) * cinc[3];
802
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
803
0
        }
804
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
805
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
806
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
807
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
808
0
      }
809
0
    } else {
810
0
      ccv_nnc_tensor_view_get_inc(ha, hainc);
811
0
      float* hap = ha->data.f32;
812
0
      if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
813
0
      {
814
0
        // Special casing if the ainc[3] is the same as dim[3]
815
0
        for (i[0] = 0; i[0] < dim[0]; i[0]++)
816
0
        {
817
0
          for (i[1] = 0; i[1] < dim[1]; i[1]++)
818
0
          {
819
0
            for (x = 0; x < count; x++)
820
0
            {
821
0
              const float v = gp[x] / bp[x];
822
0
              hap[x] = v;
823
0
              hbp[x] = -cp[x] * v;
824
0
            }
825
0
            gp += ginc[2] * ginc[3];
826
0
            bp += binc[2] * binc[3];
827
0
            cp += cinc[2] * cinc[3];
828
0
            hap += hainc[2] * hainc[3];
829
0
            hbp += hbinc[2] * hbinc[3];
830
0
          }
831
0
          gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
832
0
          bp += (binc[1] - dim[1]) * binc[2] * binc[3];
833
0
          cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
834
0
          hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
835
0
          hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
836
0
        }
837
0
        return CCV_NNC_EXEC_SUCCESS;
838
0
      }
839
0
      // Non-optimal case, need to do skip copy.
840
0
      for (i[0] = 0; i[0] < dim[0]; i[0]++)
841
0
      {
842
0
        for (i[1] = 0; i[1] < dim[1]; i[1]++)
843
0
        {
844
0
          for (i[2] = 0; i[2] < dim[2]; i[2]++)
845
0
          {
846
0
            for (x = 0; x < dim[3]; x++)
847
0
            {
848
0
              const float v = gp[x] / bp[x];
849
0
              hap[x] = v;
850
0
              hbp[x] = -cp[x] * v;
851
0
            }
852
0
            gp += ginc[3];
853
0
            bp += binc[3];
854
0
            cp += cinc[3];
855
0
            hap += hainc[3];
856
0
            hbp += hbinc[3];
857
0
          }
858
0
          gp += (ginc[2] - dim[2]) * ginc[3];
859
0
          bp += (binc[2] - dim[2]) * binc[3];
860
0
          cp += (cinc[2] - dim[2]) * cinc[3];
861
0
          hap += (hainc[2] - dim[2]) * hainc[3];
862
0
          hbp += (hbinc[2] - dim[2]) * hbinc[3];
863
0
        }
864
0
        gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
865
0
        bp += (binc[1] - dim[1]) * binc[2] * binc[3];
866
0
        cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
867
0
        hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
868
0
        hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
869
0
      }
870
0
    }
871
0
  }
872
3
  
return CCV_NNC_EXEC_SUCCESS0
;
873
3
}
874
875
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise exponential forward: outputs[0][i] = exp(inputs[0][i]).
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Super optimal case, both tensors are fully packed: one flat loop.
		// Use expf: the data is float 32, exp() would round-trip through double.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = expf(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	const int count = dim[2] * dim[3];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]: the two innermost
		// dimensions are contiguous and can be collapsed into a single loop of `count`.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = expf(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy (advance by the view increments).
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = expf(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
940
941
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Exp[x], x] = Exp[x], so the input gradient is the output gradient times
	// the saved forward result (inputs[2]).
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const y = (ccv_nnc_tensor_view_t*)inputs[2];
	if (g)
	{
		ccv_nnc_tensor_view_t* prod_inputs[] = { g, y };
		_ccv_nnc_ewprod_forw_cpu_ref(prod_inputs, 2, (ccv_nnc_tensor_view_t**)outputs, output_size);
	} else
		// No incoming gradient: treat it as all-ones, i.e. just copy exp(x) over.
		_ccv_nnc_tensor_transfer_cpu_ref(y, (ccv_nnc_tensor_view_t*)outputs[0]);
	return CCV_NNC_EXEC_SUCCESS;
}
952
953
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise natural logarithm forward: outputs[0][i] = log(inputs[0][i]).
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Super optimal case, both tensors are fully packed: one flat loop.
		// Use logf: the data is float 32, log() would round-trip through double.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = logf(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	const int count = dim[2] * dim[3];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]: the two innermost
		// dimensions are contiguous and can be collapsed into a single loop of `count`.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = logf(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy (advance by the view increments).
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = logf(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
1018
1019
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Log[x], x] = 1 / x: divide the incoming gradient by the forward input.
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
	_ccv_nnc_ewdiv_forw_cpu_ref(1, g, a, h);
	return CCV_NNC_EXEC_SUCCESS;
}
1025
1026
static int _ccv_nnc_ewsqrt_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise square root forward: outputs[0][i] = sqrt(inputs[0][i]).
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, dim);
	assert(ccv_nnc_tensor_view_check_dim(b, dim));
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Super optimal case, both tensors are fully packed: one flat loop.
		// Use sqrtf: the data is float 32, sqrt() would round-trip through double.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = sqrtf(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	const int count = dim[2] * dim[3];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]: the two innermost
		// dimensions are contiguous and can be collapsed into a single loop of `count`.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = sqrtf(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy (advance by the view increments).
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = sqrtf(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
1091
1092
static int _ccv_nnc_ewsqrt_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	// D[Sqrt[x], x] = 0.5 / Sqrt[x] — inputs[2] is presumably the saved forward
	// output sqrt(x); verify against the command's backward I/O layout.
	ccv_nnc_tensor_view_t* const g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)inputs[2];
	ccv_nnc_tensor_view_t* const h = (ccv_nnc_tensor_view_t*)outputs[0];
	_ccv_nnc_ewdiv_forw_cpu_ref(0.5, g, b, h);
	return CCV_NNC_EXEC_SUCCESS;
}
1098
1099
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise sum, forward pass.
	registry->exec = _ccv_nnc_ewsum_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1107
1108
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise sum, backward pass.
	registry->exec = _ccv_nnc_ewsum_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1116
1117
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise product, forward pass.
	registry->exec = _ccv_nnc_ewprod_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1125
1126
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise product, backward pass.
	registry->exec = _ccv_nnc_ewprod_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1134
1135
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise division, forward pass.
	registry->exec = _ccv_nnc_ewdiv_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1143
1144
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise division, backward pass.
	registry->exec = _ccv_nnc_ewdiv_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1152
1153
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise exponential, forward pass.
	registry->exec = _ccv_nnc_ewexp_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1161
1162
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise exponential, backward pass.
	registry->exec = _ccv_nnc_ewexp_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1170
1171
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise natural log, forward pass.
	registry->exec = _ccv_nnc_ewlog_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1179
1180
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise natural log, backward pass.
	registry->exec = _ccv_nnc_ewlog_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1188
1189
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise square root, forward pass.
	registry->exec = _ccv_nnc_ewsqrt_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
1197
1198
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSQRT_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	// Register the CPU reference kernel for element-wise square root, backward pass.
	registry->exec = _ccv_nnc_ewsqrt_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}