Coverage Report

Created: 2022-08-03 23:52

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/cmd/util/ccv_nnc_util_cpu_ref.c
Line
Count
Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
#include "../_ccv_nnc_cpu_ref.h"
13
14
void _ccv_nnc_tensor_transfer_cpu_ref_f16(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
15
0
{
16
  // Assuming this is float 16.
17
0
  assert(a->info.datatype == b->info.datatype);
18
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
19
0
  {
20
    // Super optimal case, just do memcpy.
21
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
22
0
    return;
23
0
  }
24
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
25
0
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
26
0
  int binc[CCV_NNC_MAX_DIM_ALLOC];
27
0
  ccv_nnc_tensor_view_get_dim(a, dim);
28
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
29
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
30
0
  ccv_nnc_tensor_view_get_inc(b, binc);
31
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
32
0
  int i[CCV_NNC_MAX_DIM + 2];
33
0
  ccv_float16_t* ap = a->data.f16;
34
0
  ccv_float16_t* bp = b->data.f16;
35
0
  if (ainc[3] == dim[3] && binc[3] == dim[3])
36
0
  {
37
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
38
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
39
0
    {
40
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
41
0
      {
42
0
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(ccv_float16_t));
43
0
        ap += ainc[2] * ainc[3];
44
0
        bp += binc[2] * binc[3];
45
0
      }
46
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
47
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
48
0
    }
49
0
    return;
50
0
  }
51
  // Non-optimal case, need to do skip copy.
52
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
53
0
  {
54
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
55
0
    {
56
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
57
0
      {
58
0
        memcpy(bp, ap, dim[3] * sizeof(ccv_float16_t));
59
0
        ap += ainc[3];
60
0
        bp += binc[3];
61
0
      }
62
0
      ap += (ainc[2] - dim[2]) * ainc[3];
63
0
      bp += (binc[2] - dim[2]) * binc[3];
64
0
    }
65
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
66
0
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
67
0
  }
68
0
}
69
70
void _ccv_nnc_tensor_transfer_cpu_ref_f32(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
71
23.6k
{
72
  // Assuming this is float 32.
73
23.6k
  assert(a->info.datatype == b->info.datatype);
74
23.6k
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
75
23.6k
  {
76
    // Super optimal case, just do memcpy.
77
23.6k
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
78
23.6k
    return;
79
23.6k
  }
80
11
  int dim[CCV_NNC_MAX_DIM_ALLOC];
81
11
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
82
11
  int binc[CCV_NNC_MAX_DIM_ALLOC];
83
11
  ccv_nnc_tensor_view_get_dim(a, dim);
84
11
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
85
11
  ccv_nnc_tensor_view_get_inc(a, ainc);
86
11
  ccv_nnc_tensor_view_get_inc(b, binc);
87
11
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
88
11
  int i[CCV_NNC_MAX_DIM + 2];
89
11
  float* ap = a->data.f32;
90
11
  float* bp = b->data.f32;
91
11
  if (ainc[3] == dim[3] && binc[3] == dim[3])
92
2
  {
93
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
94
4
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
95
2
    {
96
6
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
97
4
      {
98
4
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(float));
99
4
        ap += ainc[2] * ainc[3];
100
4
        bp += binc[2] * binc[3];
101
4
      }
102
2
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
103
2
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
104
2
    }
105
2
    return;
106
2
  }
107
  // Non-optimal case, need to do skip copy.
108
18
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
109
9
  {
110
19
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
111
10
    {
112
24
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
113
14
      {
114
14
        memcpy(bp, ap, dim[3] * sizeof(float));
115
14
        ap += ainc[3];
116
14
        bp += binc[3];
117
14
      }
118
10
      ap += (ainc[2] - dim[2]) * ainc[3];
119
10
      bp += (binc[2] - dim[2]) * binc[3];
120
10
    }
121
9
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
122
9
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
123
9
  }
124
9
}
125
126
void _ccv_nnc_tensor_transfer_cpu_ref_f64(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
127
0
{
128
  // Assuming this is double.
129
0
  assert(a->info.datatype == b->info.datatype);
130
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
131
0
  {
132
    // Super optimal case, just do memcpy.
133
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
134
0
    return;
135
0
  }
136
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
137
0
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
138
0
  int binc[CCV_NNC_MAX_DIM_ALLOC];
139
0
  ccv_nnc_tensor_view_get_dim(a, dim);
140
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
141
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
142
0
  ccv_nnc_tensor_view_get_inc(b, binc);
143
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
144
0
  int i[CCV_NNC_MAX_DIM + 2];
145
0
  double* ap = a->data.f64;
146
0
  double* bp = b->data.f64;
147
0
  if (ainc[3] == dim[3] && binc[3] == dim[3])
148
0
  {
149
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
150
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
151
0
    {
152
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
153
0
      {
154
0
        memcpy(bp, ap, dim[2] * dim[3] * sizeof(double));
155
0
        ap += ainc[2] * ainc[3];
156
0
        bp += binc[2] * binc[3];
157
0
      }
158
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
159
0
      bp += (binc[1] - dim[1]) * binc[2] * binc[3];
160
0
    }
161
0
    return;
162
0
  }
163
  // Non-optimal case, need to do skip copy.
164
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
165
0
  {
166
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
167
0
    {
168
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
169
0
      {
170
0
        memcpy(bp, ap, dim[3] * sizeof(double));
171
0
        ap += ainc[3];
172
0
        bp += binc[3];
173
0
      }
174
0
      ap += (ainc[2] - dim[2]) * ainc[3];
175
0
      bp += (binc[2] - dim[2]) * binc[3];
176
0
    }
177
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
178
0
    bp += (binc[1] - dim[1]) * binc[2] * binc[3];
179
0
  }
180
0
}
181
182
void _ccv_nnc_tensor_set_cpu_ref_f32(ccv_nnc_tensor_view_t* const a, const float b)
183
10.7k
{
184
  // Assuming this is float 32.
185
10.7k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
186
10.7k
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
187
10.7k
  int x;
188
10.7k
  if (!CCV_IS_TENSOR_VIEW(a))
189
10.7k
  {
190
    // Super optimal case, just do one for-loop for sum.
191
10.7k
    const int tensor_count = ccv_nnc_tensor_count(a->info);
192
397k
    for (x = 0; x < tensor_count; x++)
193
386k
      a->data.f32[x] = b;
194
10.7k
    return;
195
10.7k
  }
196
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
197
0
  ccv_nnc_tensor_view_get_dim(a, dim);
198
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
199
0
  int i[CCV_NNC_MAX_DIM + 2];
200
0
  float* ap = a->data.f32;
201
0
  const int count = dim[2] * dim[3];
202
0
  if (ainc[3] == dim[3])
203
0
  {
204
    // Special casing if the ainc[3] is the same as dim[3]
205
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
206
0
    {
207
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
208
0
      {
209
0
        for (x = 0; x < count; x++)
210
0
          ap[x] = b;
211
0
        ap += ainc[2] * ainc[3];
212
0
      }
213
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
214
0
    }
215
0
    return;
216
0
  }
217
  // Non-optimal case, need to do skip copy.
218
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
219
0
  {
220
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
221
0
    {
222
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
223
0
      {
224
0
        for (x = 0; x < dim[3]; x++)
225
0
          ap[x] = b;
226
0
        ap += ainc[3];
227
0
      }
228
0
      ap += (ainc[2] - dim[2]) * ainc[3];
229
0
    }
230
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
231
0
  }
232
0
}
233
234
void _ccv_nnc_tensor_set_cpu_ref_f64(ccv_nnc_tensor_view_t* const a, const double b)
235
1
{
236
  // Assuming this is double.
237
1
  int dim[CCV_NNC_MAX_DIM_ALLOC];
238
1
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
239
1
  int x;
240
1
  if (!CCV_IS_TENSOR_VIEW(a))
241
1
  {
242
    // Super optimal case, just do one for-loop for sum.
243
1
    const int tensor_count = ccv_nnc_tensor_count(a->info);
244
7.92k
    for (x = 0; x < tensor_count; x++)
245
7.92k
      a->data.f64[x] = b;
246
1
    return;
247
1
  }
248
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
249
0
  ccv_nnc_tensor_view_get_dim(a, dim);
250
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
251
0
  int i[CCV_NNC_MAX_DIM + 2];
252
0
  double* ap = a->data.f64;
253
0
  const int count = dim[2] * dim[3];
254
0
  if (ainc[3] == dim[3])
255
0
  {
256
    // Special casing if the ainc[3] is the same as dim[3]
257
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
258
0
    {
259
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
260
0
      {
261
0
        for (x = 0; x < count; x++)
262
0
          ap[x] = b;
263
0
        ap += ainc[2] * ainc[3];
264
0
      }
265
0
      ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
266
0
    }
267
0
    return;
268
0
  }
269
  // Non-optimal case, need to do skip copy.
270
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
271
0
  {
272
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
273
0
    {
274
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
275
0
      {
276
0
        for (x = 0; x < dim[3]; x++)
277
0
          ap[x] = b;
278
0
        ap += ainc[3];
279
0
      }
280
0
      ap += (ainc[2] - dim[2]) * ainc[3];
281
0
    }
282
0
    ap += (ainc[1] - dim[1]) * ainc[2] * ainc[3];
283
0
  }
284
0
}
285
286
static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
287
1.54k
{
288
1.54k
  int i;
289
6.46k
  for (i = 0; i < ccv_min(input_size, output_size); i++)
290
4.91k
  {
291
4.91k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
292
4.91k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
293
4.91k
    if (a != b) // Only do transfer if these are two different tensors.
294
54
    {
295
54
      assert(a->info.datatype == b->info.datatype);
296
54
      if (a->info.datatype == CCV_16F)
297
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
298
54
      else if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
299
54
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
300
0
      else if (a->info.datatype == CCV_64F)
301
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
302
54
    }
303
4.91k
  }
304
1.54k
  return CCV_NNC_EXEC_SUCCESS;
305
1.54k
}
306
307
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
308
1
{
309
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
310
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
311
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
312
1
  registry->algorithms = 1;
313
1
  registry->exec = _ccv_nnc_data_transfer;
314
1
}
315
316
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
317
1
{
318
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
319
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S;
320
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
321
1
  registry->algorithms = 1;
322
1
  registry->exec = _ccv_nnc_data_transfer;
323
1
}
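The two registrations above wire _ccv_nnc_data_transfer in as the CPU reference backend for the data-transfer command. As a minimal illustrative sketch (not code from this listing), assuming the usual convenience macros from nnc/ccv_nnc_easy.h such as CPU_TENSOR_NHWC, TENSOR_LIST and CMD_DATA_TRANSFER_FORWARD, a copy between two plain CPU tensors might look like this; with non-view tensors it lands on the memcpy fast path above.

// Sketch: copy one CPU tensor into another through the data-transfer command.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3), 0);
int i;
for (i = 0; i < 6; i++)
  a->data.f32[i] = i; // arbitrary source values
ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
ccv_nnc_tensor_free(a);
ccv_nnc_tensor_free(b);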
324
325
static int _ccv_nnc_set_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
326
10.8k
{
327
10.8k
  int i;
328
10.8k
  if (cmd.info.blas.a[0] == 0)
329
272
    for (i = 0; i < output_size; i++)
330
136
      ccv_nnc_tensor_zero(outputs[i]);
331
10.7k
  else
332
21.5k
    for (i = 0; i < output_size; i++)
333
10.7k
      if (outputs[i]->info.datatype == CCV_32F)
334
10.7k
        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
335
1
      else if (outputs[i]->info.datatype == CCV_64F)
336
1
        _ccv_nnc_tensor_set_cpu_ref_f64((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
337
0
      else
338
0
        { assert(0); }
339
10.8k
  return CCV_NNC_EXEC_SUCCESS;
340
10.8k
}
341
342
static int _ccv_nnc_set_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
343
0
{
344
0
  int i;
345
0
  for (i = 0; i < output_size; i++)
346
0
    ccv_nnc_tensor_zero(outputs[i]);
347
0
  return CCV_NNC_EXEC_SUCCESS;
348
0
}
349
350
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
351
1
{
352
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
353
1
  registry->tensor_datatypes = CCV_64F | CCV_32F;
354
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
355
1
  registry->algorithms = 1;
356
1
  registry->exec = _ccv_nnc_set_forw;
357
1
}
358
359
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
360
1
{
361
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
362
1
  registry->tensor_datatypes = CCV_64F | CCV_32F;
363
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
364
1
  registry->algorithms = 1;
365
1
  registry->exec = _ccv_nnc_set_back;
366
1
}
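For the set command registered above, a similar hedged sketch, assuming the same helpers and that CMD_SET_FORWARD carries the fill value that ends up in cmd.info.blas.a[0].

// Sketch: fill a CPU tensor with a constant through the set command.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 4, 4), 0);
// A non-zero value goes through _ccv_nnc_tensor_set_cpu_ref_f32.
ccv_nnc_cmd_exec(CMD_SET_FORWARD(1.5), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(a), 0);
// A zero value takes the ccv_nnc_tensor_zero shortcut in _ccv_nnc_set_forw instead.
ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(a), 0);
ccv_nnc_tensor_free(a);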
367
368
static void _ccv_nnc_tensor_nhwc_nchw_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
369
9
{
370
  // Assuming this is float 32.
371
9
  int ainc[CCV_NNC_MAX_DIM + 2];
372
9
  int binc[CCV_NNC_MAX_DIM + 2];
373
9
  int k;
374
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
375
9
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
376
9
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
377
9
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
378
9
  assert(a_offset == 0 || a_offset == 1);
379
9
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
380
9
  assert(b_offset == 0 || b_offset == 1);
381
9
  ccv_nnc_tensor_view_get_inc(a, ainc);
382
9
  ccv_nnc_tensor_view_get_inc(b, binc);
383
  // Comparing N
384
9
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
385
9
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
386
  // Comparing C
387
9
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
388
9
  const int c = a->info.dim[a_nd - 1];
389
  // Comparing HW
390
9
  int hw[CCV_NNC_MAX_DIM];
391
27
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
392
18
  {
393
18
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
394
18
    hw[k] = a->info.dim[k + 1 - a_offset];
395
18
  }
396
9
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
397
9
  int i[CCV_NNC_MAX_DIM + 2];
398
9
  float* ap = a->data.f32;
399
9
  float* bp = b->data.f32;
400
  // Non-optimal case, need to do skip copy.
401
20
  for (i[0] = 0; i[0] < n; i[0]++)
402
11
  {
403
792
    for (i[3] = 0; i[3] < c; i[3]++)
404
781
    {
405
781
      float* apu = ap + i[3];
406
19.4k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
407
18.6k
      {
408
3.35M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
409
3.33M
          bp[i[2]] = apu[i[2] * ainc[3]];
410
18.6k
        apu += ainc[2] * ainc[3];
411
18.6k
        bp += binc[3];
412
18.6k
      }
413
781
      bp += (binc[2] - hw[0]) * binc[3];
414
781
    }
415
11
    ap += ainc[1] * ainc[2] * ainc[3];
416
11
    bp += (binc[1] - c) * binc[2] * binc[3];
417
11
  }
418
9
}
419
420
static void _ccv_nnc_tensor_nchw_nhwc_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
421
9
{
422
  // Assuming this is float 32.
423
9
  int ainc[CCV_NNC_MAX_DIM + 2];
424
9
  int binc[CCV_NNC_MAX_DIM + 2];
425
9
  int k;
426
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
427
9
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
428
9
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
429
9
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
430
9
  assert(a_offset == 0 || a_offset == 1);
431
9
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
432
9
  assert(b_offset == 0 || b_offset == 1);
433
9
  ccv_nnc_tensor_view_get_inc(a, ainc);
434
9
  ccv_nnc_tensor_view_get_inc(b, binc);
435
  // Comparing N
436
9
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
437
9
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
438
  // Comparing C
439
9
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
440
9
  const int c = a->info.dim[1 - a_offset];
441
  // Comparing HW
442
9
  int hw[CCV_NNC_MAX_DIM];
443
27
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
444
18
  {
445
18
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
446
18
    hw[k] = a->info.dim[k + 2 - a_offset];
447
18
  }
448
9
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
449
9
  int i[CCV_NNC_MAX_DIM + 2];
450
9
  float* ap = a->data.f32;
451
9
  float* bp = b->data.f32;
452
  // Non-optimal case, need to do skip copy.
453
20
  for (i[0] = 0; i[0] < n; i[0]++)
454
11
  {
455
792
    for (i[3] = 0; i[3] < c; i[3]++)
456
781
    {
457
781
      float* bpu = bp + i[3];
458
20.1k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
459
19.3k
      {
460
5.04M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
461
5.02M
          bpu[i[2] * binc[3]] = ap[i[2]];
462
19.3k
        ap += ainc[3];
463
19.3k
        bpu += binc[2] * binc[3];
464
19.3k
      }
465
781
      ap += (ainc[2] - hw[0]) * ainc[3];
466
781
    }
467
11
    ap += (ainc[1] - c) * ainc[2] * ainc[3];
468
11
    bp += binc[1] * binc[2] * binc[3];
469
11
  }
470
9
}
471
472
static void _ccv_nnc_tensor_nhwc_nchw_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
473
1
{
474
  // Assuming this is double.
475
1
  int ainc[CCV_NNC_MAX_DIM + 2];
476
1
  int binc[CCV_NNC_MAX_DIM + 2];
477
1
  int k;
478
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
479
1
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
480
1
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
481
1
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
482
1
  assert(a_offset == 0 || a_offset == 1);
483
1
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
484
1
  assert(b_offset == 0 || b_offset == 1);
485
1
  ccv_nnc_tensor_view_get_inc(a, ainc);
486
1
  ccv_nnc_tensor_view_get_inc(b, binc);
487
  // Comparing N
488
1
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
489
1
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
490
  // Comparing C
491
1
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
492
1
  const int c = a->info.dim[a_nd - 1];
493
  // Comparing HW
494
1
  int hw[CCV_NNC_MAX_DIM];
495
3
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
496
2
  {
497
2
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
498
2
    hw[k] = a->info.dim[k + 1 - a_offset];
499
2
  }
500
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
501
1
  int i[CCV_NNC_MAX_DIM + 2];
502
1
  double* ap = a->data.f64;
503
1
  double* bp = b->data.f64;
504
  // Non-optimal case, need to do skip copy.
505
12
  for (i[0] = 0; i[0] < n; i[0]++)
506
11
  {
507
99
    for (i[3] = 0; i[3] < c; i[3]++)
508
88
    {
509
88
      double* apu = ap + i[3];
510
968
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
511
880
      {
512
8.80k
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
513
7.92k
          bp[i[2]] = apu[i[2] * ainc[3]];
514
880
        apu += ainc[2] * ainc[3];
515
880
        bp += binc[3];
516
880
      }
517
88
      bp += (binc[2] - hw[0]) * binc[3];
518
88
    }
519
11
    ap += ainc[1] * ainc[2] * ainc[3];
520
11
    bp += (binc[1] - c) * binc[2] * binc[3];
521
11
  }
522
1
}
523
524
static void _ccv_nnc_tensor_nchw_nhwc_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
525
0
{
526
  // Assuming this is double.
527
0
  int ainc[CCV_NNC_MAX_DIM + 2];
528
0
  int binc[CCV_NNC_MAX_DIM + 2];
529
0
  int k;
530
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
531
0
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
532
0
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
533
0
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
534
0
  assert(a_offset == 0 || a_offset == 1);
535
0
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
536
0
  assert(b_offset == 0 || b_offset == 1);
537
0
  ccv_nnc_tensor_view_get_inc(a, ainc);
538
0
  ccv_nnc_tensor_view_get_inc(b, binc);
539
  // Comparing N
540
0
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
541
0
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
542
  // Comparing C
543
0
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
544
0
  const int c = a->info.dim[1 - a_offset];
545
  // Comparing HW
546
0
  int hw[CCV_NNC_MAX_DIM];
547
0
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
548
0
  {
549
0
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
550
0
    hw[k] = a->info.dim[k + 2 - a_offset];
551
0
  }
552
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
553
0
  int i[CCV_NNC_MAX_DIM + 2];
554
0
  double* ap = a->data.f64;
555
0
  double* bp = b->data.f64;
556
  // Non-optimal case, need to do skip copy.
557
0
  for (i[0] = 0; i[0] < n; i[0]++)
558
0
  {
559
0
    for (i[3] = 0; i[3] < c; i[3]++)
560
0
    {
561
0
      double* bpu = bp + i[3];
562
0
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
563
0
      {
564
0
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
565
0
          bpu[i[2] * binc[3]] = ap[i[2]];
566
0
        ap += ainc[3];
567
0
        bpu += binc[2] * binc[3];
568
0
      }
569
0
      ap += (ainc[2] - hw[0]) * ainc[3];
570
0
    }
571
0
    ap += (ainc[1] - c) * ainc[2] * ainc[3];
572
0
    bp += binc[1] * binc[2] * binc[3];
573
0
  }
574
0
}
575
576
static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
577
21
{
578
21
  assert(output_size <= input_size);
579
21
  int i;
580
44
  for (i = 0; i < output_size; i++)
581
23
  {
582
23
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
583
23
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
584
23
    assert(a != b); // Cannot do inplace transform.
585
23
    assert(a->info.datatype == b->info.datatype);
586
23
    if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
587
22
    {
588
22
      if (a->info.format == b->info.format) {
589
        // If it is the same, just do a normal data transfer.
590
4
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
591
18
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
592
9
        _ccv_nnc_tensor_nhwc_nchw_f32(a, b);
593
9
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
594
9
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
595
9
        _ccv_nnc_tensor_nchw_nhwc_f32(a, b);
596
9
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
597
0
        assert(0);
598
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
599
0
        assert(0);
600
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
601
0
        assert(0);
602
0
      }
603
22
    } else if (a->info.datatype == CCV_64F) {
604
1
      if (a->info.format == b->info.format) {
605
        // If it is the same, just do a normal data transfer.
606
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
607
1
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
608
1
        _ccv_nnc_tensor_nhwc_nchw_f64(a, b);
609
1
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
610
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
611
0
        _ccv_nnc_tensor_nchw_nhwc_f64(a, b);
612
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
613
0
        assert(0);
614
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
615
0
        assert(0);
616
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
617
0
        assert(0);
618
0
      }
619
1
    } else {
620
0
      assert(0);
621
0
    }
622
23
  }
623
21
  return CCV_NNC_EXEC_SUCCESS;
624
21
}
625
626
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
627
1
{
628
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
629
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S;
630
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
631
1
  registry->algorithms = 1;
632
1
  registry->exec = _ccv_nnc_format_transform;
633
1
}
634
635
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
636
1
{
637
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
638
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S;
639
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
640
1
  registry->algorithms = 1;
641
1
  registry->exec = _ccv_nnc_format_transform;
642
1
}
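For the format-transform command registered above, a minimal NHWC-to-NCHW sketch, again assuming the CPU_TENSOR_NHWC, CPU_TENSOR_NCHW and CMD_FORMAT_TRANSFORM_FORWARD helpers rather than anything defined in this file.

// Sketch: repack an 8x8x3 NHWC tensor into NCHW layout.
// The shapes describe the same data, so _ccv_nnc_tensor_nhwc_nchw_f32 does the shuffle.
ccv_nnc_tensor_t* const nhwc = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8, 8, 3), 0);
ccv_nnc_tensor_t* const nchw = ccv_nnc_tensor_new(0, CPU_TENSOR_NCHW(32F, 3, 8, 8), 0);
ccv_nnc_cmd_exec(CMD_FORMAT_TRANSFORM_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(nhwc), TENSOR_LIST(nchw), 0);
ccv_nnc_tensor_free(nhwc);
ccv_nnc_tensor_free(nchw);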
643
644
static int _ccv_nnc_transpose(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
645
6
{
646
6
  assert(output_size <= input_size);
647
6
  int k;
648
12
  for (k = 0; k < output_size; k++)
649
6
  {
650
6
    const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[k];
651
6
    ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[k];
652
6
    const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
653
6
    const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
654
6
    assert(a_nd == b_nd);
655
6
    assert(a_nd <= CCV_NNC_MAX_DIM + 2); // I can only handle maximum 4.
656
6
    assert(a_nd >= 2 && b_nd >= 2); // You cannot transpose if it is less than 2.
657
6
    assert(a->info.dim[cmd.info.transpose.axis[0]] == b->info.dim[cmd.info.transpose.axis[1]]);
658
6
    assert(a->info.dim[cmd.info.transpose.axis[1]] == b->info.dim[cmd.info.transpose.axis[0]]);
659
6
    int x;
660
28
    for (x = 0; x < a_nd; x++)
661
22
      if (x != cmd.info.transpose.axis[0] && x != cmd.info.transpose.axis[1])
662
10
        { assert(a->info.dim[x] == b->info.dim[x]); }
663
6
    size_t astride[CCV_NNC_MAX_DIM + 2];
664
6
    size_t bstride[CCV_NNC_MAX_DIM + 2];
665
6
    int dim[CCV_NNC_MAX_DIM + 2];
666
8
    for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++)
667
2
      dim[x] = 1;
668
28
    for (x = 0; x < b_nd; x++)
669
22
      dim[x] = b->info.dim[x];
670
    // Don't use ccv_nnc_tensor_view_get_inc or get_dim because these will prefill beginning to 1:
671
    // for example, if the dimension is [2, 4], it will fill to [1, 1, 2, 4] so the axis index will
672
    // be messed up.
673
6
    const int* const ainc = CCV_IS_TENSOR_VIEW(a) ? a->inc : a->info.dim;
674
6
    const int* const binc = CCV_IS_TENSOR_VIEW(b) ? b->inc : b->info.dim;
675
14
    for (x = a_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
676
8
      astride[x] = 1;
677
22
    for (x = a_nd - 2; x >= 0; x--)
678
16
      astride[x] = astride[x + 1] * ainc[x + 1];
679
14
    for (x = b_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
680
8
      bstride[x] = 1;
681
22
    for (x = b_nd - 2; x >= 0; x--)
682
16
      bstride[x] = bstride[x + 1] * binc[x + 1];
683
6
    const float* const ap = a->data.f32;
684
6
    float* const bp = b->data.f32;
685
6
    int i[CCV_NNC_MAX_DIM + 2];
686
6
    int j[CCV_NNC_MAX_DIM + 2] = {
687
6
      0, 1, 2, 3
688
6
    };
689
6
    CCV_SWAP(j[cmd.info.transpose.axis[0]], j[cmd.info.transpose.axis[1]], x);
690
27
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
691
21
    {
692
21
      float* const bp0 = bp + i[0] * bstride[0];
693
71
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
694
50
      {
695
50
        float* const bp1 = bp0 + i[1] * bstride[1];
696
154
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
697
104
        {
698
104
          float* const bp2 = bp1 + i[2] * bstride[2];
699
320
          for (i[3] = 0; i[3] < dim[3]; i[3]++)
700
216
            bp2[i[3]] = ap[i[j[0]] * astride[0] + i[j[1]] * astride[1] + i[j[2]] * astride[2] + i[j[3]] * astride[3]];
701
104
        }
702
50
      }
703
21
    }
704
6
  }
705
6
  return CCV_NNC_EXEC_SUCCESS;
706
6
}
707
708
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
709
1
{
710
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
711
1
  registry->tensor_datatypes = CCV_32F;
712
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
713
1
  registry->algorithms = 1;
714
1
  registry->exec = _ccv_nnc_transpose;
715
1
}
716
717
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
718
1
{
719
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
720
1
  registry->tensor_datatypes = CCV_32F;
721
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
722
1
  registry->algorithms = 1;
723
1
  registry->exec = _ccv_nnc_transpose;
724
1
}
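The transpose backend above swaps the two axes supplied in cmd.info.transpose.axis. A hedged sketch, assuming CMD_TRANSPOSE_FORWARD takes those two axis indices.

// Sketch: swap axes 1 and 2 of a 2x3x4 tensor; the output carries the swapped 2x4x3 shape.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 3, 4), 0);
ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 4, 3), 0);
ccv_nnc_cmd_exec(CMD_TRANSPOSE_FORWARD(1, 2), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
ccv_nnc_tensor_free(a);
ccv_nnc_tensor_free(b);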
725
726
static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
727
120k
{
728
120k
  assert(output_size <= input_size);
729
120k
  int i;
730
240k
  for (i = 0; i < output_size; i++)
731
120k
  {
732
120k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
733
120k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
734
120k
    assert(a != b); // Cannot do inplace transform.
735
120k
    assert(a->info.format == b->info.format);
736
120k
    if (a->info.datatype == b->info.datatype) {
737
      // If it is the same, just do a normal data transfer.
738
2
      if (a->info.datatype == CCV_16F)
739
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
740
2
      else if (a->info.datatype == CCV_32F)
741
2
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
742
0
      else if (a->info.datatype == CCV_64F)
743
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
744
120k
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16F) {
745
120k
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
746
120k
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
747
120k
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
748
120k
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
749
120k
      ccv_float_to_half_precision(a->data.f32, (uint16_t*)b->data.f16, tensor_count);
750
120k
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_32F) {
751
159
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
752
159
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
753
159
      const int tensor_count = ccv_nnc_tensor_count(a->info);
754
159
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
755
159
      ccv_half_precision_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count);
756
159
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_32F) {
757
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
758
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
759
1
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
760
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
761
1
      int i;
762
129
      for (i = 0; i < tensor_count; i++)
763
128
        b->data.f32[i] = (float)a->data.f64[i];
764
3
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_64F) {
765
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
766
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
767
1
      const int tensor_count = ccv_nnc_tensor_count(a->info);
768
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
769
129
      for (i = 0; i < tensor_count; i++)
770
128
        b->data.f64[i] = (double)a->data.f32[i];
771
2
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16F) {
772
2
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
773
2
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
774
2
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
775
2
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
776
2
      ccv_double_to_half_precision(a->data.f64, (uint16_t*)b->data.f16, tensor_count);
777
2
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_64F) {
778
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
779
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
780
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
781
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
782
0
      ccv_half_precision_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
783
0
    }
784
120k
  }
785
120k
  return CCV_NNC_EXEC_SUCCESS;
786
120k
}
787
788
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
789
1
{
790
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
791
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
792
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
793
1
  registry->algorithms = 1;
794
1
  registry->exec = _ccv_nnc_datatype_conversion;
795
1
}
796
797
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
798
1
{
799
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
800
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F;
801
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
802
1
  registry->algorithms = 1;
803
1
  registry->exec = _ccv_nnc_datatype_conversion;
804
1
}
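For the datatype-conversion command registered above, a down-conversion sketch under the same assumed helpers; both tensors have to be contiguous, as the asserts in _ccv_nnc_datatype_conversion require.

// Sketch: convert 128 float32 values to float16 via the conversion command
// (internally ccv_float_to_half_precision on the contiguous buffers).
ccv_nnc_tensor_t* const a32 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 128), 0);
ccv_nnc_tensor_t* const a16 = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16F, 128), 0);
ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a32), TENSOR_LIST(a16), 0);
ccv_nnc_tensor_free(a32);
ccv_nnc_tensor_free(a16);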
805
806
static void _ccv_nnc_masked_fill_cpu_ref_f(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
807
2
{
808
2
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
809
2
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
810
2
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
811
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
812
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
813
2
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
814
2
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
815
  // Assuming this is float 32.
816
2
  int adim[CCV_NNC_MAX_DIM_ALLOC];
817
2
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
818
2
  ccv_nnc_tensor_view_get_dim(a, adim);
819
2
  ccv_nnc_tensor_view_get_dim(b, bdim);
820
2
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
821
2
  int binc[CCV_NNC_MAX_DIM_ALLOC];
822
2
  int cinc[CCV_NNC_MAX_DIM_ALLOC];
823
2
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
824
2
  int x;
825
2
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
826
0
  {
827
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
828
    // Super optimal case, just do one for-loop for sum.
829
0
    for (x = 0; x < tensor_count; x++)
830
0
      c->data.f32[x] = (b->data.f32[x] == p) ? q : a->data.f32[x];
831
0
    return;
832
0
  }
833
2
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
834
2
  ccv_nnc_tensor_view_get_inc(a, ainc);
835
2
  ccv_nnc_tensor_view_get_inc(b, binc);
836
2
  ccv_nnc_tensor_view_get_inc(c, cinc);
837
2
  int i[CCV_NNC_MAX_DIM + 2];
838
2
  float* ap = a->data.f32;
839
2
  float* bp = b->data.f32;
840
2
  float* cp = c->data.f32;
841
2
  const int count = cdim[2] * cdim[3];
842
2
  if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
843
2
  {
844
    // Special casing if the ainc[3] is the same as dim[3]
845
4
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
846
2
    {
847
2
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
848
2
      float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
849
14
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
850
12
      {
851
12
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
852
12
        float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
853
252
        for (x = 0; x < count; x++)
854
240
          cp[x] = (bp1[x] == p) ? q : ap1[x];
855
12
        cp += cinc[2] * cinc[3];
856
12
      }
857
2
      cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
858
2
    }
859
2
    return;
860
2
  }
861
  // Non-optimal case, need to do skip copy and handle broadcasting.
862
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
863
0
  {
864
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
865
0
    float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
866
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
867
0
    {
868
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
869
0
      float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
870
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
871
0
      {
872
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
873
0
        float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
874
0
        if (adim[3] == 1)
875
0
          for (x = 0; x < cdim[3]; x++)
876
0
            cp[x] = (bp2[x] == p) ? q : ap2[0];
877
0
        else if (bdim[3] == 1)
878
0
          if (bp2[0] == p)
879
0
            for (x = 0; x < cdim[3]; x++)
880
0
              cp[x] = q;
881
0
          else
882
0
            for (x = 0; x < cdim[3]; x++)
883
0
              cp[x] = ap2[x];
884
0
        else
885
0
          for (x = 0; x < cdim[3]; x++)
886
0
            cp[x] = (bp2[x] == p) ? q : ap2[x];
887
0
        cp += cinc[3];
888
0
      }
889
0
      cp += (cinc[2] - cdim[2]) * cinc[3];
890
0
    }
891
0
    cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
892
0
  }
893
0
}
894
895
static void _ccv_nnc_masked_fill_cpu_ref_s(const int p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
896
4
{
897
4
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
898
4
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
899
4
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
900
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
901
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
902
4
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
903
4
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
904
  // Assuming this is float 32.
905
4
  int adim[CCV_NNC_MAX_DIM_ALLOC];
906
4
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
907
4
  ccv_nnc_tensor_view_get_dim(a, adim);
908
4
  ccv_nnc_tensor_view_get_dim(b, bdim);
909
4
  int ainc[CCV_NNC_MAX_DIM_ALLOC];
910
4
  int binc[CCV_NNC_MAX_DIM_ALLOC];
911
4
  int cinc[CCV_NNC_MAX_DIM_ALLOC];
912
4
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
913
4
  int x;
914
4
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
915
0
  {
916
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
917
    // Super optimal case, just do one for-loop for sum.
918
0
    for (x = 0; x < tensor_count; x++)
919
0
      c->data.f32[x] = (b->data.i32[x] == p) ? q : a->data.f32[x];
920
0
    return;
921
0
  }
922
4
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
923
4
  ccv_nnc_tensor_view_get_inc(a, ainc);
924
4
  ccv_nnc_tensor_view_get_inc(b, binc);
925
4
  ccv_nnc_tensor_view_get_inc(c, cinc);
926
4
  int i[CCV_NNC_MAX_DIM + 2];
927
4
  float* ap = a->data.f32;
928
4
  int* bp = b->data.i32;
929
4
  float* cp = c->data.f32;
930
4
  const int count = cdim[2] * cdim[3];
931
4
  if (ainc[3] == cdim[3] && binc[3] == cdim[3] && cinc[3] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
932
4
  {
933
    // Special casing if the ainc[3] is the same as dim[3]
934
8
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
935
4
    {
936
4
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
937
4
      int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
938
28
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
939
24
      {
940
24
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
941
24
        int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
942
504
        for (x = 0; x < count; x++)
943
480
          cp[x] = (bp1[x] == p) ? q : ap1[x];
944
24
        cp += cinc[2] * cinc[3];
945
24
      }
946
4
      cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
947
4
    }
948
4
    return;
949
4
  }
950
  // Non-optimal case, need to do skip copy and handle broadcasting.
951
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
952
0
  {
953
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * ainc[1] * ainc[2] * ainc[3];
954
0
    int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * binc[1] * binc[2] * binc[3];
955
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
956
0
    {
957
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * ainc[2] * ainc[3];
958
0
      int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * binc[2] * binc[3];
959
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
960
0
      {
961
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * ainc[3];
962
0
        int* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * binc[3];
963
0
        if (adim[3] == 1)
964
0
          for (x = 0; x < cdim[3]; x++)
965
0
            cp[x] = (bp2[x] == p) ? q : ap2[0];
966
0
        else if (bdim[3] == 1)
967
0
          if (bp2[0] == p)
968
0
            for (x = 0; x < cdim[3]; x++)
969
0
              cp[x] = q;
970
0
          else
971
0
            for (x = 0; x < cdim[3]; x++)
972
0
              cp[x] = ap2[x];
973
0
        else
974
0
          for (x = 0; x < cdim[3]; x++)
975
0
            cp[x] = (bp2[x] == p) ? q : ap2[x];
976
0
        cp += cinc[3];
977
0
      }
978
0
      cp += (cinc[2] - cdim[2]) * cinc[3];
979
0
    }
980
0
    cp += (cinc[1] - cdim[1]) * cinc[2] * cinc[3];
981
0
  }
982
0
}
983
984
static int _ccv_nnc_masked_fill_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
985
3
{
986
3
  assert(input_size >= 2);
987
3
  assert(inputs[0]);
988
3
  assert(inputs[1]);
989
3
  assert(outputs[0]);
990
3
  if (inputs[1]->info.datatype == CCV_32F)
991
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
992
2
  else if (inputs[1]->info.datatype == CCV_32S)
993
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
994
3
  return CCV_NNC_EXEC_SUCCESS;
995
3
}
996
997
static int _ccv_nnc_masked_fill_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
998
3
{
999
3
  assert(input_size >= 3);
1000
3
  if (inputs[2]->info.datatype == CCV_32F)
1001
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1002
2
  else if (inputs[2]->info.datatype == CCV_32S)
1003
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1004
  // TODO: doesn't really support taking gradient on mask.
1005
  // if (output_size >= 2 && outputs[1])
1006
3
  return CCV_NNC_EXEC_SUCCESS;
1007
3
}
1008
1009
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1010
1
{
1011
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1012
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
1013
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1014
1
  registry->algorithms = 1;
1015
1
  registry->exec = _ccv_nnc_masked_fill_forw;
1016
1
}
1017
1018
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1019
1
{
1020
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1021
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
1022
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1023
1
  registry->algorithms = 1;
1024
1
  registry->exec = _ccv_nnc_masked_fill_back;
1025
1
}
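Finally, for the masked-fill command registered above, a hedged sketch that assumes CMD_MASKED_FILL_FORWARD takes the compare value and the fill value in that order (they arrive here as cmd.info.blas.a[0] and a[1]); the int32 mask routes through _ccv_nnc_masked_fill_cpu_ref_s.

// Sketch: wherever the int32 mask equals 0, write -1e8 into the output; elsewhere copy a.
ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 5), 0);
ccv_nnc_tensor_t* const mask = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32S, 2, 5), 0);
ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2, 5), 0);
ccv_nnc_cmd_exec(CMD_MASKED_FILL_FORWARD(0, -1e8), ccv_nnc_no_hint, 0, TENSOR_LIST(a, mask), TENSOR_LIST(b), 0);
ccv_nnc_tensor_free(a);
ccv_nnc_tensor_free(mask);
ccv_nnc_tensor_free(b);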