Coverage Report

Created: 2021-04-05 03:19

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/util/ccv_nnc_util_cpu_ref.c
Line | Count | Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
#include "../_ccv_nnc_cpu_ref.h"
13
14
void _ccv_nnc_tensor_transfer_cpu_ref_f16(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
15
0
{
16
0
  // Assuming this is float 16.
17
0
  assert(a->info.datatype == b->info.datatype);
18
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
19
0
  {
20
0
    // Super optimal case, just do memcpy.
21
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
22
0
    return;
23
0
  }
24
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
25
0
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
26
0
  int binc[CCV_NNC_MAX_DIM_ALLOC];
27
0
  ccv_nnc_tensor_view_get_dim(a, dim);
28
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
29
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
30
0
  ccv_nnc_tensor_view_get_inc(b, binc);
31
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
32
0
  int i[CCV_NNC_MAX_DIM + 2];
33
0
  ccv_float16_t* ap = a->data.f16;
34
0
  ccv_float16_t* bp = b->data.f16;
35
0
  if (ainc[3] == dim[3] && binc[3] == dim[3])
36
0
  {
37
0
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
38
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
39
0
    {
40
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
41
0
      {
42
0
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(ccv_float16_t));
43
0
        ap += ainc[2] * ainc[3];
44
0
        bp += binc[2] * binc[3];
45
0
      }
46
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
47
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
48
0
    }
49
0
    return;
50
0
  }
51
0
  // Non-optimal case, need to do skip copy.
52
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
53
0
  {
54
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
55
0
    {
56
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
57
0
      {
58
0
        memcpy(bp, ap, dim[3] * sizeof(ccv_float16_t));
59
0
        ap += ainc[3];
60
0
        bp += binc[3];
61
0
      }
62
0
      ap += (ainc[2] - dim[2]) * ainc[3];
63
0
      bp += (binc[2] - dim[2]) * binc[3];
64
0
    }
65
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
66
0
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
67
0
  }
68
0
}
69
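Note: the transfer kernels above walk a possibly non-contiguous tensor view using its increments ("inc") and memcpy one contiguous row at a time. A minimal standalone sketch of that skip-copy pattern, assuming plain row-major buffers and hypothetical dim/inc arrays rather than the library's tensor view types:

#include <string.h>

/* Copy a dim[0] x dim[1] view out of larger row-major buffers.
 * ainc[1]/binc[1] are the full row lengths of source and destination;
 * when both equal dim[1] the views are contiguous and one memcpy suffices. */
static void skip_copy_2d(float* dst, const float* src,
	const int dim[2], const int ainc[2], const int binc[2])
{
	int y;
	for (y = 0; y < dim[0]; y++)
	{
		memcpy(dst, src, dim[1] * sizeof(float)); /* one contiguous row */
		src += ainc[1]; /* skip over source row padding */
		dst += binc[1]; /* skip over destination row padding */
	}
}
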
70
void _ccv_nnc_tensor_transfer_cpu_ref_f32(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
71
23.6k
{
72
23.6k
  // Assuming this is float 32.
73
23.6k
  assert(a->info.datatype == b->info.datatype);
74
23.6k
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
75
23.6k
  {
76
23.5k
    // Super optimal case, just do memcpy.
77
23.5k
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
78
23.5k
    return;
79
23.5k
  }
80
11
  int dim[CCV_NNC_MAX_DIM_ALLOC];
81
11
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
82
11
  int binc[CCV_NNC_MAX_DIM_ALLOC];
83
11
  ccv_nnc_tensor_view_get_dim(a, dim);
84
11
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
85
11
  ccv_nnc_tensor_view_get_inc(a, ainc);
86
11
  ccv_nnc_tensor_view_get_inc(b, binc);
87
11
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
88
11
  int i[CCV_NNC_MAX_DIM + 2];
89
11
  float* ap = a->data.f32;
90
11
  float* bp = b->data.f32;
91
11
  if (ainc[3] == dim[3] && binc[3] == dim[3])
92
2
  {
93
2
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
94
4
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
95
2
    {
96
6
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
97
4
      {
98
4
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(float));
99
4
        ap += ainc[2] * ainc[3];
100
4
        bp += binc[2] * binc[3];
101
4
      }
102
2
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
103
2
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
104
2
    }
105
2
    return;
106
2
  }
107
9
  // Non-optimal case, need to do skip copy.
108
18
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
109
9
  {
110
19
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
111
10
    {
112
24
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
113
14
      {
114
14
        memcpy(bp, ap, dim[3] * sizeof(float));
115
14
        ap += ainc[3];
116
14
        bp += binc[3];
117
14
      }
118
10
      ap += (ainc[2] - dim[2]) * ainc[3];
119
10
      bp += (binc[2] - dim[2]) * binc[3];
120
10
    }
121
9
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
122
9
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
123
9
  }
124
9
}
125
126
void _ccv_nnc_tensor_transfer_cpu_ref_f64(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
127
0
{
128
0
  // Assuming this is float 64.
129
0
  assert(a->info.datatype == b->info.datatype);
130
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
131
0
  {
132
0
    // Super optimal case, just do memcpy.
133
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
134
0
    return;
135
0
  }
136
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
137
0
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
138
0
  int binc[CCV_NNC_MAX_DIM_ALLOC];
139
0
  ccv_nnc_tensor_view_get_dim(a, dim);
140
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
141
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
142
0
  ccv_nnc_tensor_view_get_inc(b, binc);
143
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
144
0
  int i[CCV_NNC_MAX_DIM + 2];
145
0
  double* ap = a->data.f64;
146
0
  double* bp = b->data.f64;
147
0
  if (ainc[3] == dim[3] && binc[3] == dim[3])
148
0
  {
149
0
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
150
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
151
0
    {
152
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
153
0
      {
154
0
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(double));
155
0
        ap += ainc[2] * ainc[3];
156
0
        bp += binc[2] * binc[3];
157
0
      }
158
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
159
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
160
0
    }
161
0
    return;
162
0
  }
163
0
  // Non-optimal case, need to do skip copy.
164
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
165
0
  {
166
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
167
0
    {
168
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
169
0
      {
170
0
        memcpy(bp, ap, dim[3] * sizeof(double));
171
0
        ap += ainc[3];
172
0
        bp += binc[3];
173
0
      }
174
0
      ap += (ainc[2] - dim[2]) * ainc[3];
175
0
      bp += (binc[2] - dim[2]) * binc[3];
176
0
    }
177
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
178
0
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
179
0
  }
180
0
}
181
182
void _ccv_nnc_tensor_set_cpu_ref(ccv_nnc_tensor_view_t* const a, const float b)
183
10.7k
{
184
10.7k
  // Assuming this is float 32.
185
10.7k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
186
10.7k
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
187
10.7k
  int x;
188
10.7k
  if (!CCV_IS_TENSOR_VIEW(a))
189
10.7k
  {
190
10.7k
    // Super optimal case, just do one for-loop for sum.
191
10.7k
    const int tensor_count = ccv_nnc_tensor_count(a->info);
192
397k
    for (x = 0; x < tensor_count; x++)
193
386k
      a->data.f32[x] = b;
194
10.7k
    return;
195
10.7k
  }
196
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
197
0
  ccv_nnc_tensor_view_get_dim(a, dim);
198
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
199
0
  int i[CCV_NNC_MAX_DIM + 2];
200
0
  float* ap = a->data.f32;
201
0
  const int count = dim[2] * dim[3];
202
0
  if (ainc[3] == dim[3])
203
0
  {
204
0
    // Special casing if the ainc[3] is the same as dim[3]
205
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
206
0
    {
207
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
208
0
      {
209
0
        for (x = 0; x < count; x++)
210
0
          ap[x] = b;
211
0
        ap += ainc[2] * ainc[3];
212
0
      }
213
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
214
0
    }
215
0
    return;
216
0
  }
217
0
  // Non-optimal case, need to do skip copy.
218
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
219
0
  {
220
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
221
0
    {
222
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
223
0
      {
224
0
        for (x = 0; x < dim[3]; x++)
225
0
          ap[x] = b;
226
0
        ap += ainc[3];
227
0
      }
228
0
      ap += (ainc[2] - dim[2]) * ainc[3];
229
0
    }
230
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
231
0
  }
232
0
}
233
234
static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
235
1.54k
{
236
1.54k
  int i;
237
6.45k
  for (i = 0; i < ccv_min(input_size, output_size); i++)
238
4.91k
  {
239
4.91k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
240
4.91k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
241
4.91k
    if (a != b) // Only do transfer if these are two different tensors.
242
53
    {
243
53
      assert(a->info.datatype == b->info.datatype);
244
53
      if (a->info.datatype == CCV_16F)
245
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
246
53
      else if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
247
53
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
248
0
      else if (a->info.datatype == CCV_64F)
249
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
250
53
    }
251
4.91k
  }
252
1.54k
  return CCV_NNC_EXEC_SUCCESS;
253
1.54k
}
254
255
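Note: the registered backend above is normally reached through the public command API rather than called directly. A usage sketch, assuming the usual ccv_nnc helpers (ccv_nnc_cmd_exec, CMD_DATA_TRANSFER_FORWARD, TENSOR_LIST, CPU_TENSOR_NHWC) behave as they do in the library's test suite:

#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

/* Copy one CPU tensor into another via the registered CPU_REF backend. */
static void copy_example(void)
{
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 4), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 4), 0);
	/* _ccv_nnc_data_transfer is selected through the command registry. */
	ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0,
		TENSOR_LIST(a), TENSOR_LIST(b), 0);
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
}
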
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
256
1
{
257
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
258
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
259
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
260
1
  registry->algorithms = 1;
261
1
  registry->exec = _ccv_nnc_data_transfer;
262
1
}
263
264
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
265
1
{
266
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
267
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
268
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
269
1
  registry->algorithms = 1;
270
1
  registry->exec = _ccv_nnc_data_transfer;
271
1
}
272
273
static int _ccv_nnc_set_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
274
10.8k
{
275
10.8k
  int i;
276
10.8k
  if (cmd.info.blas.a[0] == 0)
277
262
    for (i = 0; i < output_size; i++)
278
131
      ccv_nnc_tensor_zero(outputs[i]);
279
10.7k
  else
280
21.4k
    for (i = 0; i < output_size; i++)
281
10.7k
      _ccv_nnc_tensor_set_cpu_ref((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
282
10.8k
  return CCV_NNC_EXEC_SUCCESS;
283
10.8k
}
284
285
static int _ccv_nnc_set_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
286
0
{
287
0
  int i;
288
0
  for (i = 0; i < output_size; i++)
289
0
    ccv_nnc_tensor_zero(outputs[i]);
290
0
  return CCV_NNC_EXEC_SUCCESS;
291
0
}
292
293
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
294
1
{
295
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
296
1
  registry->tensor_datatypes = CCV_32F;
297
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
298
1
  registry->algorithms = 1;
299
1
  registry->exec = _ccv_nnc_set_forw;
300
1
}
301
302
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
303
1
{
304
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
305
1
  registry->tensor_datatypes = CCV_32F;
306
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
307
1
  registry->algorithms = 1;
308
1
  registry->exec = _ccv_nnc_set_back;
309
1
}
310
311
static void _ccv_nnc_tensor_nhwc_nchw(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
312
9
{
313
9
  // Assuming this is float 32.
314
9
  int ainc[CCV_NNC_MAX_DIM + 2];
315
9
  int binc[CCV_NNC_MAX_DIM + 2];
316
9
  int k;
317
9
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
318
9
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
319
9
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
320
9
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
321
9
  assert(a_offset == 0 || a_offset == 1);
322
9
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
323
9
  assert(b_offset == 0 || b_offset == 1);
324
9
  ccv_nnc_tensor_view_get_inc(a, ainc);
325
9
  ccv_nnc_tensor_view_get_inc(b, binc);
326
9
  // Comparing N
327
9
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
328
9
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
329
9
  // Comparing C
330
9
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
331
9
  const int c = a->info.dim[a_nd - 1];
332
9
  // Comparing HW
333
9
  int hw[CCV_NNC_MAX_DIM];
334
27
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
335
18
  {
336
18
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
337
18
    hw[k] = a->info.dim[k + 1 - a_offset];
338
18
  }
339
9
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
340
9
  int i[CCV_NNC_MAX_DIM + 2];
341
9
  float* ap = a->data.f32;
342
9
  float* bp = b->data.f32;
343
9
  // Non-optimal case, need to do skip copy.
344
20
  for (i[0] = 0; i[0] < n; i[0]++)
345
11
  {
346
792
    for (i[3] = 0; i[3] < c; i[3]++)
347
781
    {
348
781
      float* apu = ap + i[3];
349
19.4k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
350
18.6k
      {
351
3.35M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
352
3.33M
          bp[i[2]] = apu[i[2] * ainc[3]];
353
18.6k
        apu += ainc[2] * ainc[3];
354
18.6k
        bp += binc[3];
355
18.6k
      }
356
781
      bp += (binc[2] - hw[0]) * binc[3];
357
781
    }
358
11
    ap += ainc[1] * ainc[2] * ainc[3];
359
11
    bp += (binc[1] - c) * binc[2] * binc[3];
360
11
  }
361
9
}
362
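Note: the NHWC-to-NCHW conversion above gathers along the channel axis while stepping through view increments. A minimal sketch of the same index mapping, assuming contiguous buffers and ignoring views (hypothetical helper, not part of the instrumented source):

/* b[n][c][h][w] = a[n][h][w][c], both buffers contiguous and row-major. */
static void nhwc_to_nchw(float* b, const float* a,
	const int n, const int h, const int w, const int c)
{
	int in, ih, iw, ic;
	for (in = 0; in < n; in++)
		for (ic = 0; ic < c; ic++)
			for (ih = 0; ih < h; ih++)
				for (iw = 0; iw < w; iw++)
					b[((in * c + ic) * h + ih) * w + iw] =
						a[((in * h + ih) * w + iw) * c + ic];
}
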
363
static void _ccv_nnc_tensor_nchw_nhwc(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
364
9
{
365
9
  // Assuming this is float 32.
366
9
  int ainc[CCV_NNC_MAX_DIM + 2];
367
9
  int binc[CCV_NNC_MAX_DIM + 2];
368
9
  int k;
369
9
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
370
9
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
371
9
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
372
9
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
373
9
  assert(a_offset == 0 || a_offset == 1);
374
9
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
375
9
  assert(b_offset == 0 || b_offset == 1);
376
9
  ccv_nnc_tensor_view_get_inc(a, ainc);
377
9
  ccv_nnc_tensor_view_get_inc(b, binc);
378
9
  // Comparing N
379
9
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
380
9
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
381
9
  // Comparing C
382
9
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
383
9
  const int c = a->info.dim[1 - a_offset];
384
9
  // Comparing HW
385
9
  int hw[CCV_NNC_MAX_DIM];
386
27
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
387
18
  {
388
18
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
389
18
    hw[k] = a->info.dim[k + 2 - a_offset];
390
18
  }
391
9
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
392
9
  int i[CCV_NNC_MAX_DIM + 2];
393
9
  float* ap = a->data.f32;
394
9
  float* bp = b->data.f32;
395
9
  // Non-optimal case, need to do skip copy.
396
20
  for (i[0] = 0; i[0] < n; i[0]++)
397
11
  {
398
792
    for (i[3] = 0; i[3] < c; i[3]++)
399
781
    {
400
781
      float* bpu = bp + i[3];
401
20.1k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
402
19.3k
      {
403
5.04M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
404
5.02M
          bpu[i[2] * binc[3]] = ap[i[2]];
405
19.3k
        ap += ainc[3];
406
19.3k
        bpu += binc[2] * binc[3];
407
19.3k
      }
408
781
      ap += (ainc[2] - hw[0]) * ainc[3];
409
781
    }
410
11
    ap += (ainc[1] - c) * ainc[2] * ainc[3];
411
11
    bp += binc[1] * binc[2] * binc[3];
412
11
  }
413
9
}
414
415
static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
416
20
{
417
20
  assert(output_size <= input_size);
418
20
  int i;
419
42
  for (i = 0; i < output_size; i++)
420
22
  {
421
22
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
422
22
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
423
22
    assert(a != b); // Cannot do inplace transform.
424
22
    if (a->info.format == b->info.format) {
425
4
      // If it is the same, just do a normal data transfer.
426
4
      _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
427
18
    } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
428
9
      _ccv_nnc_tensor_nhwc_nchw(a, b);
429
9
    } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
430
9
    } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
431
9
      _ccv_nnc_tensor_nchw_nhwc(a, b);
432
9
    } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
433
0
    } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
434
0
    } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
435
0
    }
436
22
  }
437
20
  return CCV_NNC_EXEC_SUCCESS;
438
20
}
439
440
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
441
1
{
442
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
443
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
444
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
445
1
  registry->algorithms = 1;
446
1
  registry->exec = _ccv_nnc_format_transform;
447
1
}
448
449
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
450
1
{
451
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
452
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
453
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
454
1
  registry->algorithms = 1;
455
1
  registry->exec = _ccv_nnc_format_transform;
456
1
}
457
458
static int _ccv_nnc_transpose(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
459
6
{
460
6
  assert(output_size <= input_size);
461
6
  int k;
462
12
  for (k = 0; k < output_size; k++)
463
6
  {
464
6
    const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[k];
465
6
    ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[k];
466
6
    const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
467
6
    const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
468
6
    assert(a_nd == b_nd);
469
6
    assert(a_nd <= CCV_NNC_MAX_DIM + 2); // I can only handle maximum 4.
470
6
    assert(a_nd >= 2 && b_nd >= 2); // You cannot transpose if it is less than 2.
471
6
    assert(a->info.dim[cmd.info.transpose.axis[0]] == b->info.dim[cmd.info.transpose.axis[1]]);
472
6
    assert(a->info.dim[cmd.info.transpose.axis[1]] == b->info.dim[cmd.info.transpose.axis[0]]);
473
6
    int x;
474
28
    for (x = 0; x < a_nd; x++)
475
22
      if (x != cmd.info.transpose.axis[0] && x != cmd.info.transpose.axis[1])
476
10
        { assert(a->info.dim[x] == b->info.dim[x]); }
477
6
    size_t astride[CCV_NNC_MAX_DIM + 2];
478
6
    size_t bstride[CCV_NNC_MAX_DIM + 2];
479
6
    int dim[CCV_NNC_MAX_DIM + 2];
480
8
    for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++)
481
2
      dim[x] = 1;
482
28
    for (x = 0; x < b_nd; x++)
483
22
      dim[x] = b->info.dim[x];
484
6
    // Don't use ccv_nnc_tensor_view_get_inc or get_dim because these will prefill beginning to 1:
485
6
    // for example, if the dimension is [2, 4], it will fill to [1, 1, 2, 4] so the axis index will
486
6
    // be messed up.
487
6
    const int* const ainc = CCV_IS_TENSOR_VIEW(a) ? a->inc : a->info.dim;
488
6
    const int* const binc = CCV_IS_TENSOR_VIEW(b) ? b->inc : b->info.dim;
489
14
    for (x = a_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
490
8
      astride[x] = 1;
491
22
    for (x = a_nd - 2; x >= 0; x--)
492
16
      astride[x] = astride[x + 1] * ainc[x + 1];
493
14
    for (x = b_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
494
8
      bstride[x] = 1;
495
22
    for (x = b_nd - 2; x >= 0; x--)
496
16
      bstride[x] = bstride[x + 1] * binc[x + 1];
497
6
    const float* const ap = a->data.f32;
498
6
    float* const bp = b->data.f32;
499
6
    int i[CCV_NNC_MAX_DIM + 2];
500
6
    int j[CCV_NNC_MAX_DIM + 2] = {
501
6
      0, 1, 2, 3
502
6
    };
503
6
    CCV_SWAP(j[cmd.info.transpose.axis[0]], j[cmd.info.transpose.axis[1]], x);
504
27
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
505
21
    {
506
21
      float* const bp0 = bp + i[0] * bstride[0];
507
71
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
508
50
      {
509
50
        float* const bp1 = bp0 + i[1] * bstride[1];
510
154
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
511
104
        {
512
104
          float* const bp2 = bp1 + i[2] * bstride[2];
513
320
          for (i[3] = 0; i[3] < dim[3]; i[3]++)
514
216
            bp2[i[3]] = ap[i[j[0]] * astride[0] + i[j[1]] * astride[1] + i[j[2]] * astride[2] + i[j[3]] * astride[3]];
515
104
        }
516
50
      }
517
21
    }
518
6
  }
519
6
  return CCV_NNC_EXEC_SUCCESS;
520
6
}
521
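Note: the transpose kernel above permutes two axes by indexing the source through a swapped index array j[] and per-axis strides while writing the destination sequentially. The same idea reduced to a contiguous 2-D matrix, as a hypothetical sketch:

/* b[y][x] = a[x][y] for a rows x cols row-major matrix a. */
static void transpose_2d(float* b, const float* a, const int rows, const int cols)
{
	int y, x;
	for (y = 0; y < cols; y++)
		for (x = 0; x < rows; x++)
			b[y * rows + x] = a[x * cols + y];
}
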
522
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
523
1
{
524
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
525
1
  registry->tensor_datatypes = CCV_32F;
526
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
527
1
  registry->algorithms = 1;
528
1
  registry->exec = _ccv_nnc_transpose;
529
1
}
530
531
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
532
1
{
533
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
534
1
  registry->tensor_datatypes = CCV_32F;
535
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
536
1
  registry->algorithms = 1;
537
1
  registry->exec = _ccv_nnc_transpose;
538
1
}
539
540
static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
541
120k
{
542
120k
  assert(output_size <= input_size);
543
120k
  int i;
544
240k
  for (i = 0; i < output_size; i++)
545
120k
  {
546
120k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
547
120k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
548
120k
    assert(a != b); // Cannot do inplace transform.
549
120k
    assert(a->info.format == b->info.format);
550
120k
    if (a->info.datatype == b->info.datatype) {
551
2
      // If it is the same, just do a normal data transfer.
552
2
      if (a->info.datatype == CCV_16F)
553
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
554
2
      else if (a->info.datatype == CCV_32F)
555
2
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
556
0
      else if (a->info.datatype == CCV_64F)
557
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
558
120k
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16F) {
559
120k
      assert(!CCV_IS_TENSOR_VIEW(a));
560
120k
      assert(!CCV_IS_TENSOR_VIEW(b));
561
120k
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
562
120k
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
563
120k
      ccv_float_to_half_precision(a->data.f32, (uint16_t*)b->data.f16, tensor_count);
564
120k
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_32F) {
565
150
      assert(!CCV_IS_TENSOR_VIEW(a));
566
150
      assert(!CCV_IS_TENSOR_VIEW(b));
567
150
      const int tensor_count = ccv_nnc_tensor_count(a->info);
568
150
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
569
150
      ccv_half_precision_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count);
570
150
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_32F) {
571
1
      assert(!CCV_IS_TENSOR_VIEW(a));
572
1
      assert(!CCV_IS_TENSOR_VIEW(b));
573
1
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
574
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
575
1
      int i;
576
129
      for (i = 0; i < tensor_count; 
i++128
)
577
128
        b->data.f32[i] = (float)a->data.f64[i];
578
3
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_64F) {
579
1
      assert(!CCV_IS_TENSOR_VIEW(a));
580
1
      assert(!CCV_IS_TENSOR_VIEW(b));
581
1
      const int tensor_count = ccv_nnc_tensor_count(a->info);
582
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
583
129
      for (i = 0; i < tensor_count; i++)
584
128
        b->data.f64[i] = (double)a->data.f32[i];
585
2
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16F) {
586
2
      assert(!CCV_IS_TENSOR_VIEW(a));
587
2
      assert(!CCV_IS_TENSOR_VIEW(b));
588
2
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
589
2
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
590
2
      ccv_double_to_half_precision(a->data.f64, (uint16_t*)b->data.f16, tensor_count);
591
2
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_64F) {
592
0
      assert(!CCV_IS_TENSOR_VIEW(a));
593
0
      assert(!CCV_IS_TENSOR_VIEW(b));
594
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
595
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
596
0
      ccv_half_precision_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
597
0
    }
598
120k
  }
599
120k
  return CCV_NNC_EXEC_SUCCESS;
600
120k
}
601
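Note: the conversion kernel above dispatches on (source, destination) datatype pairs and delegates the f16 cases to the half-precision helpers it calls. A minimal round-trip sketch using those same helpers, with hypothetical data:

#include <stdint.h>
#include "ccv.h"

/* Round-trip a small buffer through half precision. */
static void fp16_roundtrip(void)
{
	float in[4] = { 0.5f, 1.0f, -2.0f, 3.25f };
	uint16_t half[4];
	float out[4];
	ccv_float_to_half_precision(in, half, 4);   /* f32 -> f16 */
	ccv_half_precision_to_float(half, out, 4);  /* f16 -> f32 */
}
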
602
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
603
1
{
604
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
605
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
606
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
607
1
  registry->algorithms = 1;
608
1
  registry->exec = _ccv_nnc_datatype_conversion;
609
1
}
610
611
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
612
1
{
613
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
614
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
615
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
616
1
  registry->algorithms = 1;
617
1
  registry->exec = _ccv_nnc_datatype_conversion;
618
1
}
619
620
static void _ccv_nnc_masked_fill_cpu_ref_f(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
621
2
{
622
2
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
623
2
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
624
2
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
625
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
626
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
627
2
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
628
2
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
629
2
  // Assuming this is float 32.
630
2
  int adim[CCV_NNC_MAX_DIM_ALLOC];
631
2
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
632
2
  ccv_nnc_tensor_view_get_dim(a, adim);
633
2
  ccv_nnc_tensor_view_get_dim(b, bdim);
634
2
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
635
2
  int binc[CCV_NNC_MAX_DIM_ALLOC];
636
2
  int cinc[CCV_NNC_MAX_DIM_ALLOC];
637
2
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
638
2
  int x;
639
2
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
640
0
  {
641
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
642
0
    // Super optimal case, just do one for-loop for sum.
643
0
    for (x = 0; x < tensor_count; x++)
644
0
      c->data.f32[x] = (b->data.f32[x] == p) ? q : a->data.f32[x];
645
0
    return;
646
0
  }
647
2
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
648
2
  ccv_nnc_tensor_view_get_inc(a, ainc);
649
2
  ccv_nnc_tensor_view_get_inc(b, binc);
650
2
  ccv_nnc_tensor_view_get_inc(c, cinc);
651
2
  int i[CCV_NNC_MAX_DIM + 2];
652
2
  float* ap = a->data.f32;
653
2
  float* bp = b->data.f32;
654
2
  float* cp = c->data.f32;
655
2
  const int count = cdim[2] * cdim[3];
656
2
  if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
657
2
  {
658
2
    // Special casing if the ainc[3] is the same as dim[3]
659
4
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
660
2
    {
661
2
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
662
2
      float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
663
14
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
664
12
      {
665
12
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
666
12
        float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
667
252
        for (x = 0; x < count; x++)
668
240
          cp[x] = (bp1[x] == p) ? q : ap1[x];
669
12
        cp += cinc[2] * cinc[3];
670
12
      }
671
2
      cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
672
2
    }
673
2
    return;
674
2
  }
675
0
  // Non-optimal case, need to do skip copy and handle broadcasting.
676
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
677
0
  {
678
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
679
0
    float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
680
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
681
0
    {
682
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
683
0
      float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
684
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
685
0
      {
686
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
687
0
        float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
688
0
        if (adim[3] == 1)
689
0
          for (x = 0; x < cdim[3]; x++)
690
0
            cp[x] = (bp2[x] == p) ? q : ap2[0];
691
0
        else if (bdim[3] == 1)
692
0
          if (bp2[0] == p)
693
0
            for (x = 0; x < cdim[3]; x++)
694
0
              cp[x] = q;
695
0
          else
696
0
            for (x = 0; x < cdim[3]; x++)
697
0
              cp[x] = ap2[x];
698
0
        else
699
0
          for (x = 0; x < cdim[3]; x++)
700
0
            cp[x] = (bp2[x] == p) ? q : ap2[x];
701
0
        cp += cinc[3];
702
0
      }
703
0
      cp += (cinc[2] - cdim[2]) * cinc[3];
704
0
    }
705
0
    cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
706
0
  }
707
0
}
708
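Note: the masked-fill reference above computes c[x] = (b[x] == p) ? q : a[x], broadcasting the mask b against a where their shapes differ. Stripped of views and broadcasting, a hypothetical contiguous-only sketch:

/* Write q wherever the mask equals p, otherwise pass a[x] through. */
static void masked_fill_1d(float* c, const float* a, const float* mask,
	const float p, const float q, const int count)
{
	int x;
	for (x = 0; x < count; x++)
		c[x] = (mask[x] == p) ? q : a[x];
}
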
709
static void _ccv_nnc_masked_fill_cpu_ref_s(const int p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
710
4
{
711
4
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
712
4
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
713
4
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
714
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
715
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
716
4
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
717
4
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
718
4
  // Assuming this is float 32.
719
4
  int adim[CCV_NNC_MAX_DIM_ALLOC];
720
4
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
721
4
  ccv_nnc_tensor_view_get_dim(a, adim);
722
4
  ccv_nnc_tensor_view_get_dim(b, bdim);
723
4
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
724
4
  int binc[CCV_NNC_MAX_DIM_ALLOC];
725
4
  int cinc[CCV_NNC_MAX_DIM_ALLOC];
726
4
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
727
4
  int x;
728
4
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
729
0
  {
730
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
731
0
    // Super optimal case, just do one for-loop for sum.
732
0
    for (x = 0; x < tensor_count; x++)
733
0
      c->data.f32[x] = (b->data.i32[x] == p) ? q : a->data.f32[x];
734
0
    return;
735
0
  }
736
4
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
737
4
  ccv_nnc_tensor_view_get_inc(a, ainc);
738
4
  ccv_nnc_tensor_view_get_inc(b, binc);
739
4
  ccv_nnc_tensor_view_get_inc(c, cinc);
740
4
  int i[CCV_NNC_MAX_DIM + 2];
741
4
  float* ap = a->data.f32;
742
4
  int* bp = b->data.i32;
743
4
  float* cp = c->data.f32;
744
4
  const int count = cdim[2] * cdim[3];
745
4
  if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
746
4
  {
747
4
    // Special casing if the ainc[3] is the same as dim[3]
748
8
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
749
4
    {
750
4
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
751
4
      int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
752
28
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
753
24
      {
754
24
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
755
24
        int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
756
504
        for (x = 0; x < count; x++)
757
480
          cp[x] = (bp1[x] == p) ? q : ap1[x];
758
24
        cp += cinc[2] * cinc[3];
759
24
      }
760
4
      cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
761
4
    }
762
4
    return;
763
4
  }
764
0
  // Non-optimal case, need to do skip copy and handle broadcasting.
765
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
766
0
  {
767
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
768
0
    int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
769
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
770
0
    {
771
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
772
0
      int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
773
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
774
0
      {
775
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
776
0
        int* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
777
0
        if (adim[3] == 1)
778
0
          for (x = 0; x < cdim[3]; x++)
779
0
            cp[x] = (bp2[x] == p) ? q : ap2[0];
780
0
        else if (bdim[3] == 1)
781
0
          if (bp2[0] == p)
782
0
            for (x = 0; x < cdim[3]; x++)
783
0
              cp[x] = q;
784
0
          else
785
0
            for (x = 0; x < cdim[3]; x++)
786
0
              cp[x] = ap2[x];
787
0
        else
788
0
          for (x = 0; x < cdim[3]; x++)
789
0
            cp[x] = (bp2[x] == p) ? q : ap2[x];
790
0
        cp += cinc[3];
791
0
      }
792
0
      cp += (cinc[2] - cdim[2]) * cinc[3];
793
0
    }
794
0
    cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
795
0
  }
796
0
}
797
798
static int _ccv_nnc_masked_fill_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
799
3
{
800
3
  assert(input_size >= 2);
801
3
  assert(inputs[0]);
802
3
  assert(inputs[1]);
803
3
  assert(outputs[0]);
804
3
  if (inputs[1]->info.datatype == CCV_32F)
805
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
806
2
  else if (inputs[1]->info.datatype == CCV_32S)
807
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
808
3
  return CCV_NNC_EXEC_SUCCESS;
809
3
}
810
811
static int _ccv_nnc_masked_fill_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
812
3
{
813
3
  assert(input_size >= 3);
814
3
  if (inputs[2]->info.datatype == CCV_32F)
815
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
816
2
  else if (inputs[2]->info.datatype == CCV_32S)
817
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
818
3
  // TODO: doesn't really support taking gradient on mask.
819
3
  // if (output_size >= 2 && outputs[1])
820
3
  return CCV_NNC_EXEC_SUCCESS;
821
3
}
822
823
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
824
1
{
825
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
826
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
827
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
828
1
  registry->algorithms = 1;
829
1
  registry->exec = _ccv_nnc_masked_fill_forw;
830
1
}
831
832
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
833
1
{
834
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
835
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
836
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
837
1
  registry->algorithms = 1;
838
1
  registry->exec = _ccv_nnc_masked_fill_back;
839
1
}