Coverage Report

Created: 2017-11-12 13:27

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/ew/ccv_nnc_ew_cpu_ref.c
Line
Count
Source (jump to first uncovered line)
1
#include <ccv.h>
2
#include <ccv_internal.h>
3
#include <nnc/ccv_nnc.h>
4
#include <nnc/ccv_nnc_easy.h>
5
#include <nnc/ccv_nnc_internal.h>
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
13
#include "../_ccv_nnc_cpu_ref.h"
14
15
int _ccv_nnc_ewsum_forw_cpu_ref(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise sum: outputs[0] = inputs[0] + inputs[1] + ... + inputs[input_size - 1].
	// Supports in-place operation where outputs[0] aliases one of the inputs.
	// Assumes float32 tensors; returns CCV_NNC_EXEC_SUCCESS.
	if (input_size == 1 && output_size == 1)
	{
		// Degenerate sum of a single tensor: just copy it over.
		_ccv_nnc_tensor_transfer_cpu_ref((const ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[0]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	int k = 0;
	// This is promised to work in-place. Find the input (if any) that shares its
	// buffer with the output so it can be consumed first, before it is clobbered.
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
		ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
		// First pass seeds the accumulator from inputs[k]; later passes accumulate into c.
		ccv_nnc_tensor_view_t* a = z > 0 ? c : (ccv_nnc_tensor_view_t*)inputs[k];
		// Skip over inputs[k], which was already consumed as the accumulator seed.
		ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)(z >= k ? inputs[z + 1] : inputs[z]);
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		ccv_nnc_tensor_view_check_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case, just do one for-loop for sum.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] + b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing if the ainc[3] is the same as dim[3] (handle the last two dims in one inner loop).
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				// BUGFIX: was (ainc[1] - dim[1]) * ainc[2] * ainc[0] — ainc[0] is the
				// outermost increment; the row-skip stride must use the innermost
				// increment ainc[3], matching the bp / cp advances below.
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] + bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
111
112
static int _ccv_nnc_ewsum_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// D[x + y + z, x] = 1: the gradient of a sum passes straight through to every
	// addend, so each requested output receives a copy of the incoming gradient
	// inputs[0] (or is set to all ones when no incoming gradient is supplied).
	int idx;
	ccv_nnc_tensor_t* const grad = inputs[0];
	if (grad == 0)
	{
		// No incoming gradient: set each requested output to 1.
		for (idx = 0; idx < output_size; idx++)
			if (outputs[idx])
				_ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[idx], 1);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Copy the gradient over, skipping outputs that already alias it.
	for (idx = 0; idx < output_size; idx++)
	{
		ccv_nnc_tensor_t* const out = outputs[idx];
		if (out && out != grad)
			_ccv_nnc_tensor_transfer_cpu_ref((ccv_nnc_tensor_view_t*)grad, (ccv_nnc_tensor_view_t*)out);
	}
	return CCV_NNC_EXEC_SUCCESS;
}
130
131
static int _ccv_nnc_ewprod_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise product: outputs[0] = inputs[0] * inputs[1] * ... * inputs[input_size - 1].
	// Mirrors _ccv_nnc_ewsum_forw_cpu_ref's structure, with * in place of +.
	if (input_size == 1 && output_size == 1)
	{
		// Degenerate product of a single tensor: just copy it over.
		_ccv_nnc_tensor_transfer_cpu_ref((const ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)outputs[0]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	int k = 0;
	// This is promised to work in-place: find the input (if any) that shares its
	// buffer with the output so it is consumed first, before it is overwritten.
	for (z = 1; z < input_size; z++)
	{
		ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
		ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z];
		if (c->data.f32 == a->data.f32)
		{
			k = z;
			break;
		}
	}
	for (z = 0; z < input_size - 1; z++)
	{
		ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
		// First pass seeds the accumulator from inputs[k]; later passes multiply into c.
		ccv_nnc_tensor_view_t* a = z > 0 ? c : (ccv_nnc_tensor_view_t*)inputs[k];
		// Skip over inputs[k], already consumed as the accumulator seed.
		ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)(z >= k ? inputs[z + 1] : inputs[z]);
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		ccv_nnc_tensor_view_check_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: all tensors are dense, one flat loop suffices.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] * b->data.f32[x];
			continue;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing when the innermost increment equals dim[3]:
			// the last two dims are contiguous and collapse into one loop of `count`.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			continue;
		}
		// Non-optimal case, need to do skip copy (walk every dim with its own stride).
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] * bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
227
228
static int _ccv_nnc_ewprod_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Backward of element-wise product.
	// D[x * y * z, x] = y * z, and the forward output b already equals x * y * z,
	// so each gradient is computed as h = g * b / a (the product of all other
	// factors, recovered by dividing the full product by this factor).
	// inputs[0] = incoming gradient g (may be 0, treated as all ones),
	// inputs[z + 1] = the z-th forward input, inputs[output_size + 1] = forward output.
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ginc[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int hinc[CCV_NNC_MAX_DIM + 2];
	int x, z;
	ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[output_size + 1];
	if (g == 0)
	{
		// No incoming gradient: h = b / a for each factor a.
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_get_inc(b, binc);
		for (z = 0; z < output_size; z++)
		{
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			ccv_nnc_tensor_view_check_dim(a, dim);
			ccv_nnc_tensor_view_check_dim(h, dim);
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case: all tensors dense, one flat loop.
				const int tensor_count = ccv_nnc_tensor_count(b->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Special casing: innermost dim contiguous, collapse last two dims.
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy (per-dim strides).
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = bp[x] / ap[x];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	} else {
		// With incoming gradient: h = g * b / a for each factor a.
		assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(g, dim);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(g, ginc);
		for (z = 0; z < output_size; z++)
		{
			ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[z + 1];
			ccv_nnc_tensor_view_t* h = (ccv_nnc_tensor_view_t*)outputs[z];
			assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			assert(h->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
			ccv_nnc_tensor_view_check_dim(a, dim);
			ccv_nnc_tensor_view_check_dim(h, dim);
			ccv_nnc_tensor_view_get_inc(a, ainc);
			ccv_nnc_tensor_view_get_inc(h, hinc);
			if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(h))
			{
				// Super optimal case: all tensors dense, one flat loop.
				const int tensor_count = ccv_nnc_tensor_count(g->info);
				for (x = 0; x < tensor_count; x++)
					h->data.f32[x] = g->data.f32[x] * b->data.f32[x] / a->data.f32[x];
				continue;
			}
			assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
			int i[CCV_NNC_MAX_DIM + 2];
			float* gp = g->data.f32;
			float* ap = a->data.f32;
			float* bp = b->data.f32;
			float* hp = h->data.f32;
			const int count = dim[2] * dim[3];
			if (ginc[3] == dim[3] && ainc[3] == dim[3] && binc[3] == dim[3] && hinc[3] == dim[3])
			{
				// Special casing: innermost dim contiguous, collapse last two dims.
				for (i[0] = 0; i[0] < dim[0]; i[0]++)
				{
					for (i[1] = 0; i[1] < dim[1]; i[1]++)
					{
						for (x = 0; x < count; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[2] * ginc[3];
						ap += ainc[2] * ainc[3];
						bp += binc[2] * binc[3];
						hp += hinc[2] * hinc[3];
					}
					gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
					ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
					bp += (binc[1] - dim[1]) * binc[2] * binc[3];
					hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
				}
				continue;
			}
			// Non-optimal case, need to do skip copy (per-dim strides).
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (i[2] = 0; i[2] < dim[2]; i[2]++)
					{
						for (x = 0; x < dim[3]; x++)
							hp[x] = gp[x] * bp[x] / ap[x];
						gp += ginc[3];
						ap += ainc[3];
						bp += binc[3];
						hp += hinc[3];
					}
					gp += (ginc[2] - dim[2]) * ginc[3];
					ap += (ainc[2] - dim[2]) * ainc[3];
					bp += (binc[2] - dim[2]) * binc[3];
					hp += (hinc[2] - dim[2]) * hinc[3];
				}
				gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				hp += (hinc[1] - dim[1]) * hinc[2] * hinc[3];
			}
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
391
392
static int _ccv_nnc_ewdiv_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Element-wise division: outputs[0] = inputs[0] / inputs[1].
	// inputs[0] == 0 is treated as an all-ones tensor, yielding the reciprocal.
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[1];
	ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)outputs[0];
	if (a == 0) // Take 0 as all ones tensor.
	{
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		int x;
		if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: both dense, one flat loop computes the reciprocal.
			const int tensor_count = ccv_nnc_tensor_count(b->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = 1 / b->data.f32[x];
			return CCV_NNC_EXEC_SUCCESS;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing: innermost dim contiguous, collapse last two dims.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = 1 / bp[x];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		// Non-optimal case, need to do skip copy (per-dim strides).
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = 1 / bp[x];
					bp += binc[3];
					cp += cinc[3];
				}
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	} else {
		assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(a, dim);
		ccv_nnc_tensor_view_check_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		int x;
		if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c))
		{
			// Super optimal case: all dense, one flat loop.
			const int tensor_count = ccv_nnc_tensor_count(a->info);
			for (x = 0; x < tensor_count; x++)
				c->data.f32[x] = a->data.f32[x] / b->data.f32[x];
			return CCV_NNC_EXEC_SUCCESS;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(a, ainc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* ap = a->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		const int count = dim[2] * dim[3];
		if (ainc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3])
		{
			// Special casing: innermost dim contiguous, collapse last two dims.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
						cp[x] = ap[x] / bp[x];
					ap += ainc[2] * ainc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
				}
				ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		// Non-optimal case, need to do skip copy (per-dim strides).
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
						cp[x] = ap[x] / bp[x];
					ap += ainc[3];
					bp += binc[3];
					cp += cinc[3];
				}
				ap += (ainc[2] - dim[2]) * ainc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
527
528
static int _ccv_nnc_ewdiv_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Backward of element-wise division a / b.
	// D[x / y, x] = 1 / y, D[x / y, y] = -x / y^2.
	// inputs[0] = incoming gradient g (0 treated as all ones), inputs[2] = b
	// (the divisor), inputs[3] = c (the forward output a / b); outputs[0] = ha
	// (gradient w.r.t. a), outputs[1] = hb (gradient w.r.t. b). Since
	// c = a / b, hb = -c * g / b equals -a * g / b^2.
	if (output_size == 1 || outputs[1] == 0)
	{
		// Only D[x / y, x] is requested, which is exactly g / b — delegate to the
		// forward division kernel.
		ccv_nnc_cmd_t forw_cmd = cmd;
		forw_cmd.cmd = CCV_NNC_EWDIV_FORWARD;
		// BUGFIX: pass the prepared forw_cmd (previously the unchanged cmd was
		// passed and forw_cmd was dead, defeating the intent of the rewrite above).
		return _ccv_nnc_ewdiv_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[2]), &outputs[0], 1, stream_context);
	}
	int dim[CCV_NNC_MAX_DIM + 2];
	int ginc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	int cinc[CCV_NNC_MAX_DIM + 2];
	int hainc[CCV_NNC_MAX_DIM + 2];
	int hbinc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* g = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)inputs[2];
	ccv_nnc_tensor_view_t* c = (ccv_nnc_tensor_view_t*)inputs[3];
	ccv_nnc_tensor_view_t* ha = (ccv_nnc_tensor_view_t*)outputs[0];
	ccv_nnc_tensor_view_t* hb = (ccv_nnc_tensor_view_t*)outputs[1];
	if (g == 0)
	{
		// No incoming gradient: ha = 1 / b, hb = -c / b.
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		ccv_nnc_tensor_view_check_dim(ha, dim);
		ccv_nnc_tensor_view_check_dim(hb, dim);
		int x;
		if (!CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb))
		{
			// Super optimal case: all dense, one flat loop.
			const int tensor_count = ccv_nnc_tensor_count(b->info);
			for (x = 0; x < tensor_count; x++)
			{
				const float v = 1 / b->data.f32[x];
				ha->data.f32[x] = v;
				hb->data.f32[x] = -c->data.f32[x] * v;
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		ccv_nnc_tensor_view_get_inc(ha, hainc);
		ccv_nnc_tensor_view_get_inc(hb, hbinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		float* hap = ha->data.f32;
		float* hbp = hb->data.f32;
		const int count = dim[2] * dim[3];
		if (binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
		{
			// Special casing: innermost dim contiguous, collapse last two dims.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
					{
						const float v = 1 / bp[x];
						hap[x] = v;
						hbp[x] = -cp[x] * v;
					}
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
					hap += hainc[2] * hainc[3];
					hbp += hbinc[2] * hbinc[3];
				}
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
				hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
				hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		// Non-optimal case, need to do skip copy (per-dim strides).
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
					{
						const float v = 1 / bp[x];
						hap[x] = v;
						hbp[x] = -cp[x] * v;
					}
					bp += binc[3];
					cp += cinc[3];
					hap += hainc[3];
					hbp += hbinc[3];
				}
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
				hap += (hainc[2] - dim[2]) * hainc[3];
				hbp += (hbinc[2] - dim[2]) * hbinc[3];
			}
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
			hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
		}
	} else {
		// With incoming gradient: ha = g / b, hb = -c * g / b.
		assert(g->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(c->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(ha->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		assert(hb->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
		ccv_nnc_tensor_view_get_dim(b, dim);
		ccv_nnc_tensor_view_check_dim(g, dim);
		ccv_nnc_tensor_view_check_dim(c, dim);
		ccv_nnc_tensor_view_check_dim(ha, dim);
		ccv_nnc_tensor_view_check_dim(hb, dim);
		int x;
		if (!CCV_IS_TENSOR_VIEW(g) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && !CCV_IS_TENSOR_VIEW(ha) && !CCV_IS_TENSOR_VIEW(hb))
		{
			// Super optimal case: all dense, one flat loop.
			const int tensor_count = ccv_nnc_tensor_count(g->info);
			for (x = 0; x < tensor_count; x++)
			{
				const float v = g->data.f32[x] / b->data.f32[x];
				ha->data.f32[x] = v;
				hb->data.f32[x] = -c->data.f32[x] * v;
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
		ccv_nnc_tensor_view_get_inc(g, ginc);
		ccv_nnc_tensor_view_get_inc(b, binc);
		ccv_nnc_tensor_view_get_inc(c, cinc);
		ccv_nnc_tensor_view_get_inc(ha, hainc);
		ccv_nnc_tensor_view_get_inc(hb, hbinc);
		int i[CCV_NNC_MAX_DIM + 2];
		float* gp = g->data.f32;
		float* bp = b->data.f32;
		float* cp = c->data.f32;
		float* hap = ha->data.f32;
		float* hbp = hb->data.f32;
		const int count = dim[2] * dim[3];
		if (ginc[3] == dim[3] && binc[3] == dim[3] && cinc[3] == dim[3] && hainc[3] == dim[3] && hbinc[3] == dim[3])
		{
			// Special casing: innermost dim contiguous, collapse last two dims.
			for (i[0] = 0; i[0] < dim[0]; i[0]++)
			{
				for (i[1] = 0; i[1] < dim[1]; i[1]++)
				{
					for (x = 0; x < count; x++)
					{
						const float v = gp[x] / bp[x];
						hap[x] = v;
						hbp[x] = -cp[x] * v;
					}
					gp += ginc[2] * ginc[3];
					bp += binc[2] * binc[3];
					cp += cinc[2] * cinc[3];
					hap += hainc[2] * hainc[3];
					hbp += hbinc[2] * hbinc[3];
				}
				gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
				bp += (binc[1] - dim[1]) * binc[2] * binc[3];
				cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
				hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
				hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
			}
			return CCV_NNC_EXEC_SUCCESS;
		}
		// Non-optimal case, need to do skip copy (per-dim strides).
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (i[2] = 0; i[2] < dim[2]; i[2]++)
				{
					for (x = 0; x < dim[3]; x++)
					{
						const float v = gp[x] / bp[x];
						hap[x] = v;
						hbp[x] = -cp[x] * v;
					}
					gp += ginc[3];
					bp += binc[3];
					cp += cinc[3];
					hap += hainc[3];
					hbp += hbinc[3];
				}
				gp += (ginc[2] - dim[2]) * ginc[3];
				bp += (binc[2] - dim[2]) * binc[3];
				cp += (cinc[2] - dim[2]) * cinc[3];
				hap += (hainc[2] - dim[2]) * hainc[3];
				hbp += (hbinc[2] - dim[2]) * hbinc[3];
			}
			gp += (ginc[1] - dim[1]) * ginc[2] * ginc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
			cp += (cinc[1] - dim[1]) * cinc[2] * cinc[3];
			hap += (hainc[1] - dim[1]) * hainc[2] * hainc[3];
			hbp += (hbinc[1] - dim[1]) * hbinc[2] * hbinc[3];
		}
	}
	return CCV_NNC_EXEC_SUCCESS;
}
734
735
// Forward pass of element-wise exp for float32 tensors: outputs[0] = exp(inputs[0]).
// Handles both dense tensors (single flat loop) and tensor views (strided 4-D walk).
static int _ccv_nnc_ewexp_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	// Both tensors must be at most (CCV_NNC_MAX_DIM + 2)-dimensional.
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, dim);
	// Output must have exactly the same shape as the input.
	ccv_nnc_tensor_view_check_dim(b, dim);
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Super optimal case, just do one for-loop for sum.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = exp(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	const int count = dim[2] * dim[3];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]:
		// rows within a plane are contiguous, so one inner loop can cover
		// dim[2] * dim[3] elements at a time.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = exp(ap[x]);
				// Advance by one full (strided) plane.
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			// Skip the remaining padding planes of this outermost slice.
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy: walk row by row and hop over
	// the per-level stride padding after each loop level.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				for (x = 0; x < dim[3]; x++)
					bp[x] = exp(ap[x]);
				// Advance one (strided) row.
				ap += ainc[3];
				bp += binc[3];
			}
			// Skip row padding at the plane level.
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		// Skip plane padding at the outermost level.
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
800
801
// Backward pass of element-wise exp.
// D[Exp[x], x] = Exp[x], so the input gradient is the incoming gradient
// (inputs[0]) multiplied element-wise by the forward output (inputs[2]).
static int _ccv_nnc_ewexp_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// D[Exp[x], x] = Exp[x]
	if (inputs[0] == 0)
	{
		// No incoming gradient: it is implicitly all-ones, so the gradient is
		// simply a copy of the forward output.
		_ccv_nnc_tensor_transfer_cpu_ref((ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
		return CCV_NNC_EXEC_SUCCESS;
	} else {
		ccv_nnc_cmd_t forw_cmd = cmd;
		forw_cmd.cmd = CCV_NNC_EWPROD_FORWARD;
		// Fix: pass forw_cmd (previously the unmodified cmd was passed, making
		// the forw_cmd setup above a dead store and forwarding the wrong command id).
		return _ccv_nnc_ewprod_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[2]), outputs, output_size, stream_context);
	}
}
814
815
// Forward pass of element-wise natural log for float32 tensors:
// outputs[0] = log(inputs[0]). Handles both dense tensors (single flat loop)
// and tensor views (strided 4-D walk), mirroring _ccv_nnc_ewexp_forw.
static int _ccv_nnc_ewlog_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	// Assuming this is float 32.
	int dim[CCV_NNC_MAX_DIM + 2];
	int ainc[CCV_NNC_MAX_DIM + 2];
	int binc[CCV_NNC_MAX_DIM + 2];
	ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[0];
	ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[0];
	// Both tensors must be at most (CCV_NNC_MAX_DIM + 2)-dimensional.
	assert(a->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	assert(b->info.dim[CCV_NNC_MAX_DIM + 2] == 0);
	ccv_nnc_tensor_view_get_dim(a, dim);
	// Output must have exactly the same shape as the input.
	ccv_nnc_tensor_view_check_dim(b, dim);
	int x;
	if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
	{
		// Super optimal case, just do one for-loop for sum.
		const int tensor_count = ccv_nnc_tensor_count(a->info);
		for (x = 0; x < tensor_count; x++)
			b->data.f32[x] = log(a->data.f32[x]);
		return CCV_NNC_EXEC_SUCCESS;
	}
	assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
	ccv_nnc_tensor_view_get_inc(a, ainc);
	ccv_nnc_tensor_view_get_inc(b, binc);
	int i[CCV_NNC_MAX_DIM + 2];
	float* ap = a->data.f32;
	float* bp = b->data.f32;
	const int count = dim[2] * dim[3];
	if (ainc[3] == dim[3] && binc[3] == dim[3])
	{
		// Special casing if the ainc[3] is the same as dim[3]:
		// rows within a plane are contiguous, so one inner loop can cover
		// dim[2] * dim[3] elements at a time.
		for (i[0] = 0; i[0] < dim[0]; i[0]++)
		{
			for (i[1] = 0; i[1] < dim[1]; i[1]++)
			{
				for (x = 0; x < count; x++)
					bp[x] = log(ap[x]);
				ap += ainc[2] * ainc[3];
				bp += binc[2] * binc[3];
			}
			ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
			bp += (binc[1] - dim[1]) * binc[2] * binc[3];
		}
		return CCV_NNC_EXEC_SUCCESS;
	}
	// Non-optimal case, need to do skip copy.
	for (i[0] = 0; i[0] < dim[0]; i[0]++)
	{
		for (i[1] = 0; i[1] < dim[1]; i[1]++)
		{
			for (i[2] = 0; i[2] < dim[2]; i[2]++)
			{
				// Fix: iterate the innermost dimension dim[3], not dim[0].
				// The pointers advance by ainc[3] / binc[3] per row, and the
				// parallel _ccv_nnc_ewexp_forw loop bounds x by dim[3]; using
				// dim[0] read/wrote the wrong number of elements per row.
				for (x = 0; x < dim[3]; x++)
					bp[x] = log(ap[x]);
				ap += ainc[3];
				bp += binc[3];
			}
			ap += (ainc[2] - dim[2]) * ainc[3];
			bp += (binc[2] - dim[2]) * binc[3];
		}
		ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
		bp += (binc[1] - dim[1]) * binc[2] * binc[3];
	}
	return CCV_NNC_EXEC_SUCCESS;
}
880
881
// Backward pass of element-wise log.
// D[Log[x], x] = 1 / x, so the input gradient is the incoming gradient
// (inputs[0]) divided element-wise by the forward input (inputs[1]),
// delegated to the EWDIV forward kernel.
static int _ccv_nnc_ewlog_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
{
	ccv_nnc_cmd_t forw_cmd = cmd;
	forw_cmd.cmd = CCV_NNC_EWDIV_FORWARD;
	// D[Log[x], x] = 1 / x
	// (The unreachable trailing "return CCV_NNC_EXEC_SUCCESS;" and its stale
	// comment were removed — control flow always exits here.)
	return _ccv_nnc_ewdiv_forw(forw_cmd, ccv_nnc_no_hint, flags, TENSOR_LIST(inputs[0], inputs[1]), outputs, output_size, stream_context);
}
890
891
// Register the CPU reference kernel for element-wise sum (forward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewsum_forw_cpu_ref;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
899
900
// Register the CPU reference kernel for element-wise sum (backward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWSUM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewsum_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
908
909
// Register the CPU reference kernel for element-wise product (forward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewprod_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
917
918
// Register the CPU reference kernel for element-wise product (backward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWPROD_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewprod_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
926
927
// Register the CPU reference kernel for element-wise division (forward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewdiv_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
935
936
// Register the CPU reference kernel for element-wise division (backward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWDIV_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewdiv_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
944
945
// Register the CPU reference kernel for element-wise exp (forward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewexp_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
953
954
// Register the CPU reference kernel for element-wise exp (backward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWEXP_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewexp_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
962
963
// Register the CPU reference kernel for element-wise log (forward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewlog_forw;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}
971
972
// Register the CPU reference kernel for element-wise log (backward):
// float32 only, CPU memory, all three tensor layouts.
REGISTER_COMMAND_BACKEND(CCV_NNC_EWLOG_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
{
	registry->exec = _ccv_nnc_ewlog_back;
	registry->algorithms = 1;
	registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
	registry->tensor_datatypes = CCV_32F;
	registry->tensor_formats = CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_CHWN;
}