Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/cmd/util/ccv_nnc_util_cpu_ref.c
Line | Count | Source
1
#include "ccv.h"
2
#include "ccv_internal.h"
3
#include "nnc/ccv_nnc.h"
4
#include "nnc/ccv_nnc_easy.h"
5
#include "nnc/ccv_nnc_internal.h"
6
#ifdef USE_OPENMP
7
#include <omp.h>
8
#endif
9
#ifdef USE_DISPATCH
10
#include <dispatch/dispatch.h>
11
#endif
12
#include "../_ccv_nnc_cpu_ref.h"
13
14
void _ccv_nnc_tensor_transfer_cpu_ref_u8(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
15
0
{
16
  // Assuming this is unsigned 8-bit.
17
0
  assert(a->info.datatype == b->info.datatype);
18
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
19
0
  {
20
    // Super optimal case, just do memcpy.
21
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
22
0
    return;
23
0
  }
24
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
25
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
26
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
27
0
  ccv_nnc_tensor_view_get_dim(a, dim);
28
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
29
0
  ccv_nnc_tensor_view_get_stride(a, astride);
30
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
31
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
32
0
  int i[CCV_NNC_MAX_DIM + 2];
33
0
  unsigned char* const ap = a->data.u8;
34
0
  unsigned char* const bp = b->data.u8;
35
0
  if (astride[2] == dim[3] && bstride[3] == dim[3])
36
0
  {
37
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
38
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
39
0
    {
40
0
      unsigned char* ap0 = ap + i[0] * astride[0];
41
0
      unsigned char* bp0 = bp + i[0] * bstride[0];
42
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
43
0
      {
44
0
        memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(unsigned char));
45
0
        ap0 += astride[1];
46
0
        bp0 += bstride[1];
47
0
      }
48
0
    }
49
0
    return;
50
0
  } else if (astride[3] == 1 && bstride[3] == 1) {
51
    // The case the last dimension is packed.
52
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
53
0
    {
54
0
      unsigned char* const ap0 = ap + i[0] * astride[0];
55
0
      unsigned char* const bp0 = bp + i[0] * bstride[0];
56
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
57
0
      {
58
0
        unsigned char* ap1 = ap0 + i[1] * astride[1];
59
0
        unsigned char* bp1 = bp0 + i[1] * bstride[1];
60
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
61
0
        {
62
0
          memcpy(bp1, ap1, dim[3] * sizeof(unsigned char));
63
0
          ap1 += astride[2];
64
0
          bp1 += bstride[2];
65
0
        }
66
0
      }
67
0
    }
68
0
    return;
69
0
  }
70
  // Non-optimal case, need to do skip copy.
71
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
72
0
  {
73
0
    unsigned char* const ap0 = ap + i[0] * astride[0];
74
0
    unsigned char* const bp0 = bp + i[0] * bstride[0];
75
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
76
0
    {
77
0
      unsigned char* ap1 = ap0 + i[1] * astride[1];
78
0
      unsigned char* bp1 = bp0 + i[1] * bstride[1];
79
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
80
0
      {
81
0
        for (i[3] = 0; i[3] < dim[3]; i[3]++)
82
0
          bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]];
83
0
        ap1 += astride[2];
84
0
        bp1 += bstride[2];
85
0
      }
86
0
    }
87
0
  }
88
0
}
89
90
void _ccv_nnc_tensor_transfer_cpu_ref_f16(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
91
0
{
92
  // Assuming this is half precision (16-bit float).
93
0
  assert(a->info.datatype == b->info.datatype);
94
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
95
0
  {
96
    // Super optimal case, just do memcpy.
97
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
98
0
    return;
99
0
  }
100
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
101
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
102
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
103
0
  ccv_nnc_tensor_view_get_dim(a, dim);
104
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
105
0
  ccv_nnc_tensor_view_get_stride(a, astride);
106
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
107
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
108
0
  int i[CCV_NNC_MAX_DIM + 2];
109
0
  ccv_float16_t* const ap = a->data.f16;
110
0
  ccv_float16_t* const bp = b->data.f16;
111
0
  if (astride[2] == dim[3] && bstride[3] == dim[3])
112
0
  {
113
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
114
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
115
0
    {
116
0
      ccv_float16_t* ap0 = ap + i[0] * astride[0];
117
0
      ccv_float16_t* bp0 = bp + i[0] * bstride[0];
118
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
119
0
      {
120
0
        memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(ccv_float16_t));
121
0
        ap0 += astride[1];
122
0
        bp0 += bstride[1];
123
0
      }
124
0
    }
125
0
    return;
126
0
  } else if (astride[3] == 1 && bstride[3] == 1) {
127
    // The case the last dimension is packed.
128
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
129
0
    {
130
0
      ccv_float16_t* const ap0 = ap + i[0] * astride[0];
131
0
      ccv_float16_t* const bp0 = bp + i[0] * bstride[0];
132
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
133
0
      {
134
0
        ccv_float16_t* ap1 = ap0 + i[1] * astride[1];
135
0
        ccv_float16_t* bp1 = bp0 + i[1] * bstride[1];
136
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
137
0
        {
138
0
          memcpy(bp1, ap1, dim[3] * sizeof(ccv_float16_t));
139
0
          ap1 += astride[2];
140
0
          bp1 += bstride[2];
141
0
        }
142
0
      }
143
0
    }
144
0
    return;
145
0
  }
146
  // Non-optimal case, need to do skip copy.
147
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
148
0
  {
149
0
    ccv_float16_t* const ap0 = ap + i[0] * astride[0];
150
0
    ccv_float16_t* const bp0 = bp + i[0] * bstride[0];
151
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
152
0
    {
153
0
      ccv_float16_t* ap1 = ap0 + i[1] * astride[1];
154
0
      ccv_float16_t* bp1 = bp0 + i[1] * bstride[1];
155
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
156
0
      {
157
0
        for (i[3] = 0; i[3] < dim[3]; i[3]++)
158
0
          bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]];
159
0
        ap1 += astride[2];
160
0
        bp1 += bstride[2];
161
0
      }
162
0
    }
163
0
  }
164
0
}
165
166
void _ccv_nnc_tensor_transfer_cpu_ref_f32(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
167
21.5k
{
168
  // Assuming this is float 32.
169
21.5k
  assert(a->info.datatype == b->info.datatype);
170
21.5k
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
171
21.5k
  {
172
    // Super optimal case, just do memcpy.
173
21.5k
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
174
21.5k
    return;
175
21.5k
  }
176
12
  int dim[CCV_NNC_MAX_DIM_ALLOC];
177
12
  int astride[CCV_NNC_MAX_DIM_ALLOC];
178
12
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
179
12
  ccv_nnc_tensor_view_get_dim(a, dim);
180
12
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
181
12
  ccv_nnc_tensor_view_get_stride(a, astride);
182
12
  ccv_nnc_tensor_view_get_stride(b, bstride);
183
12
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
184
12
  int i[CCV_NNC_MAX_DIM + 2];
185
12
  float* const ap = a->data.f32;
186
12
  float* const bp = b->data.f32;
187
12
  if (astride[2] == dim[3] && bstride[2] == dim[3])
188
3
  {
189
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
190
6
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
191
3
    {
192
3
      float* ap0 = ap + i[0] * astride[0];
193
3
      float* bp0 = bp + i[0] * bstride[0];
194
8
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
195
5
      {
196
5
        memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(float));
197
5
        ap0 += astride[1];
198
5
        bp0 += bstride[1];
199
5
      }
200
3
    }
201
3
    return;
202
9
  } else if (astride[3] == 1 && bstride[3] == 1) {
203
    // The case the last dimension is packed.
204
21
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
205
13
    {
206
13
      float* const ap0 = ap + i[0] * astride[0];
207
13
      float* const bp0 = bp + i[0] * bstride[0];
208
45
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
209
32
      {
210
32
        float* ap1 = ap0 + i[1] * astride[1];
211
32
        float* bp1 = bp0 + i[1] * bstride[1];
212
167
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
213
135
        {
214
135
          memcpy(bp1, ap1, dim[3] * sizeof(float));
215
135
          ap1 += astride[2];
216
135
          bp1 += bstride[2];
217
135
        }
218
32
      }
219
13
    }
220
8
    return;
221
8
  }
222
  // Non-optimal case, need to do skip copy.
223
2
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
224
1
  {
225
1
    float* const ap0 = ap + i[0] * astride[0];
226
1
    float* const bp0 = bp + i[0] * bstride[0];
227
3
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
228
2
    {
229
2
      float* ap1 = ap0 + i[1] * astride[1];
230
2
      float* bp1 = bp0 + i[1] * bstride[1];
231
10
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
232
8
      {
233
32
        for (i[3] = 0; i[3] < dim[3]; i[3]++)
234
24
          bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]];
235
8
        ap1 += astride[2];
236
8
        bp1 += bstride[2];
237
8
      }
238
2
    }
239
1
  }
240
1
}
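
The transfer kernels above pick between three strategies: one memcpy when neither tensor is a view, a memcpy per innermost row when the last dimension is packed, and an element-by-element skip copy otherwise. The following standalone sketch (hypothetical names, not the ccv API) illustrates the strided fallback for a 4-D view; when both innermost strides are 1 the inner loop collapses into the per-row memcpy that the report above shows being taken 135 times.

/* Illustrative sketch only, not part of the instrumented source: a generic
 * strided 4-D copy with caller-supplied shape and per-dimension element strides. */
static void strided_copy_4d_f32(float* dst, const float* src,
  const int dim[4], const int dst_stride[4], const int src_stride[4])
{
  int i0, i1, i2, i3;
  for (i0 = 0; i0 < dim[0]; i0++)
    for (i1 = 0; i1 < dim[1]; i1++)
      for (i2 = 0; i2 < dim[2]; i2++)
      {
        const float* sp = src + i0 * src_stride[0] + i1 * src_stride[1] + i2 * src_stride[2];
        float* dp = dst + i0 * dst_stride[0] + i1 * dst_stride[1] + i2 * dst_stride[2];
        for (i3 = 0; i3 < dim[3]; i3++)
          dp[i3 * dst_stride[3]] = sp[i3 * src_stride[3]];
      }
}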
241
242
void _ccv_nnc_tensor_transfer_cpu_ref_f64(const ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b)
243
0
{
244
  // Assuming this is float 64 (double).
245
0
  assert(a->info.datatype == b->info.datatype);
246
0
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b))
247
0
  {
248
    // Super optimal case, just do memcpy.
249
0
    memcpy(b->data.u8, a->data.u8, ccv_nnc_tensor_count(a->info) * CCV_GET_DATA_TYPE_SIZE(a->info.datatype));
250
0
    return;
251
0
  }
252
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
253
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
254
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
255
0
  ccv_nnc_tensor_view_get_dim(a, dim);
256
0
  assert(ccv_nnc_tensor_view_check_dim(b, dim));
257
0
  ccv_nnc_tensor_view_get_stride(a, astride);
258
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
259
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
260
0
  int i[CCV_NNC_MAX_DIM + 2];
261
0
  double* ap = a->data.f64;
262
0
  double* bp = b->data.f64;
263
0
  if (astride[2] == dim[3] && bstride[2] == dim[3])
264
0
  {
265
    // Special casing if the ainc[3] is the same as dim[3] (do memcpy for the last two dim)
266
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
267
0
    {
268
0
      double* ap0 = ap + i[0] * astride[0];
269
0
      double* bp0 = bp + i[0] * bstride[0];
270
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
271
0
      {
272
0
        memcpy(bp0, ap0, dim[2] * dim[3] * sizeof(double));
273
0
        ap0 += astride[1];
274
0
        bp0 += bstride[1];
275
0
      }
276
0
    }
277
0
    return;
278
0
  } else if (astride[3] == 1 && bstride[3] == 1) {
279
    // The case the last dimension is packed.
280
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
281
0
    {
282
0
      double* const ap0 = ap + i[0] * astride[0];
283
0
      double* const bp0 = bp + i[0] * bstride[0];
284
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
285
0
      {
286
0
        double* ap1 = ap0 + i[1] * astride[1];
287
0
        double* bp1 = bp0 + i[1] * bstride[1];
288
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
289
0
        {
290
0
          memcpy(bp1, ap1, dim[3] * sizeof(double));
291
0
          ap1 += astride[2];
292
0
          bp1 += bstride[2];
293
0
        }
294
0
      }
295
0
    }
296
0
    return;
297
0
  }
298
  // Non-optimal case, need to do skip copy.
299
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
300
0
  {
301
0
    double* const ap0 = ap + i[0] * astride[0];
302
0
    double* const bp0 = bp + i[0] * bstride[0];
303
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
304
0
    {
305
0
      double* ap1 = ap0 + i[1] * astride[1];
306
0
      double* bp1 = bp0 + i[1] * bstride[1];
307
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
308
0
      {
309
0
        for (i[3] = 0; i[3] < dim[3]; i[3]++)
310
0
          bp1[i[3] * bstride[3]] = ap1[i[3] * astride[3]];
311
0
        ap1 += astride[2];
312
0
        bp1 += bstride[2];
313
0
      }
314
0
    }
315
0
  }
316
0
}
317
318
void _ccv_nnc_tensor_set_cpu_ref_f16(ccv_nnc_tensor_view_t* const a, const float b)
319
0
{
320
  // Assuming this is short.
321
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
322
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
323
0
  short h;
324
0
  ccv_float_to_half_precision((float*)&b, (uint16_t*)&h, 1);
325
0
  int x;
326
0
  if (!CCV_IS_TENSOR_VIEW(a))
327
0
  {
328
    // Super optimal case, just do one for-loop for sum.
329
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
330
0
    for (x = 0; x < tensor_count; x++)
331
0
      a->data.f16[x].v = h;
332
0
    return;
333
0
  }
334
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
335
0
  ccv_nnc_tensor_view_get_dim(a, dim);
336
0
  ccv_nnc_tensor_view_get_stride(a, astride);
337
0
  int i[CCV_NNC_MAX_DIM + 2];
338
0
  short* const ap = (short*)a->data.f16;
339
0
  const int count = dim[2] * dim[3];
340
0
  if (astride[2] == dim[3])
341
0
  {
342
    // Special casing if the ainc[3] is the same as dim[3]
343
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
344
0
    {
345
0
      short* ap0 = ap + i[0] * astride[0];
346
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
347
0
      {
348
0
        for (x = 0; x < count; x++)
349
0
          ap0[x] = h;
350
0
        ap0 += astride[1];
351
0
      }
352
0
    }
353
0
    return;
354
0
  } else if (astride[3] == 1) {
355
    // The case the last dimension is packed.
356
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
357
0
    {
358
0
      short* const ap0 = ap + i[0] * astride[0];
359
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
360
0
      {
361
0
        short* ap1 = ap0 + i[1] * astride[1];
362
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
363
0
        {
364
0
          for (x = 0; x < dim[3]; x++)
365
0
            ap1[x] = h;
366
0
          ap1 += astride[2];
367
0
        }
368
0
      }
369
0
    }
370
0
    return;
371
0
  }
372
  // Non-optimal case, need to do skip copy.
373
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
374
0
  {
375
0
    short* const ap0 = ap + i[0] * astride[0];
376
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
377
0
    {
378
0
      short* ap1 = ap0 + i[1] * astride[1];
379
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
380
0
      {
381
0
        for (x = 0; x < dim[3]; x++)
382
0
          ap1[x * astride[3]] = h;
383
0
        ap1 += astride[2];
384
0
      }
385
0
    }
386
0
  }
387
0
}
388
389
void _ccv_nnc_tensor_set_cpu_ref_bf16(ccv_nnc_tensor_view_t* const a, const float b)
390
0
{
391
  // Assuming this is short.
392
0
  int dim[CCV_NNC_MAX_DIM_ALLOC];
393
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
394
0
  short h;
395
0
  ccv_float_to_bfloat((float*)&b, (uint16_t*)&h, 1);
396
0
  int x;
397
0
  if (!CCV_IS_TENSOR_VIEW(a))
398
0
  {
399
    // Super optimal case, just do one for-loop for sum.
400
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
401
0
    for (x = 0; x < tensor_count; x++)
402
0
      a->data.f16[x].v = h;
403
0
    return;
404
0
  }
405
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
406
0
  ccv_nnc_tensor_view_get_dim(a, dim);
407
0
  ccv_nnc_tensor_view_get_stride(a, astride);
408
0
  int i[CCV_NNC_MAX_DIM + 2];
409
0
  short* const ap = (short*)a->data.f16;
410
0
  const int count = dim[2] * dim[3];
411
0
  if (astride[2] == dim[3])
412
0
  {
413
    // Special casing if the ainc[3] is the same as dim[3]
414
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
415
0
    {
416
0
      short* ap0 = ap + i[0] * astride[0];
417
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
418
0
      {
419
0
        for (x = 0; x < count; x++)
420
0
          ap0[x] = h;
421
0
        ap0 += astride[1];
422
0
      }
423
0
    }
424
0
    return;
425
0
  } else if (astride[3] == 1) {
426
    // The case the last dimension is packed.
427
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
428
0
    {
429
0
      short* const ap0 = ap + i[0] * astride[0];
430
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
431
0
      {
432
0
        short* ap1 = ap0 + i[1] * astride[1];
433
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
434
0
        {
435
0
          for (x = 0; x < dim[3]; x++)
436
0
            ap1[x] = h;
437
0
          ap1 += astride[2];
438
0
        }
439
0
      }
440
0
    }
441
0
    return;
442
0
  }
443
  // Non-optimal case, need to do skip copy.
444
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
445
0
  {
446
0
    short* const ap0 = ap + i[0] * astride[0];
447
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
448
0
    {
449
0
      short* ap1 = ap0 + i[1] * astride[1];
450
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
451
0
      {
452
0
        for (x = 0; x < dim[3]; x++)
453
0
          ap1[x * astride[3]] = h;
454
0
        ap1 += astride[2];
455
0
      }
456
0
    }
457
0
  }
458
0
}
459
460
void _ccv_nnc_tensor_set_cpu_ref_f32(ccv_nnc_tensor_view_t* const a, const float b)
461
10.9k
{
462
  // Assuming this is float 32.
463
10.9k
  int dim[CCV_NNC_MAX_DIM_ALLOC];
464
10.9k
  int astride[CCV_NNC_MAX_DIM_ALLOC];
465
10.9k
  int x;
466
10.9k
  if (!CCV_IS_TENSOR_VIEW(a))
467
10.9k
  {
468
    // Super optimal case, just do one for-loop for sum.
469
10.9k
    const int tensor_count = ccv_nnc_tensor_count(a->info);
470
400k
    for (x = 0; x < tensor_count; x++)
471
389k
      a->data.f32[x] = b;
472
10.9k
    return;
473
10.9k
  }
474
10.9k
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
475
0
  ccv_nnc_tensor_view_get_dim(a, dim);
476
0
  ccv_nnc_tensor_view_get_stride(a, astride);
477
0
  int i[CCV_NNC_MAX_DIM + 2];
478
0
  float* const ap = a->data.f32;
479
0
  const int count = dim[2] * dim[3];
480
0
  if (astride[2] == dim[3])
481
0
  {
482
    // Special casing if the ainc[3] is the same as dim[3]
483
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
484
0
    {
485
0
      float* ap0 = ap + i[0] * astride[0];
486
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
487
0
      {
488
0
        for (x = 0; x < count; x++)
489
0
          ap0[x] = b;
490
0
        ap0 += astride[1];
491
0
      }
492
0
    }
493
0
    return;
494
0
  } else if (astride[3] == 1) {
495
    // The case the last dimension is packed.
496
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
497
0
    {
498
0
      float* const ap0 = ap + i[0] * astride[0];
499
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
500
0
      {
501
0
        float* ap1 = ap0 + i[1] * astride[1];
502
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
503
0
        {
504
0
          for (x = 0; x < dim[3]; x++)
505
0
            ap1[x] = b;
506
0
          ap1 += astride[2];
507
0
        }
508
0
      }
509
0
    }
510
0
    return;
511
0
  }
512
  // Non-optimal case, need to do skip copy.
513
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
514
0
  {
515
0
    float* const ap0 = ap + i[0] * astride[0];
516
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
517
0
    {
518
0
      float* ap1 = ap0 + i[1] * astride[1];
519
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
520
0
      {
521
0
        for (x = 0; x < dim[3]; x++)
522
0
          ap1[x * astride[3]] = b;
523
0
        ap1 += astride[2];
524
0
      }
525
0
    }
526
0
  }
527
0
}
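
The set kernels use the same shape of dispatch as the transfer kernels: a flat loop over ccv_nnc_tensor_count() when the tensor is not a view, nested strided loops otherwise. Below is a minimal sketch of the contiguity condition that justifies the flat loop, using hypothetical helpers that are not part of ccv.

/* Illustrative sketch, not part of the instrumented source: a 4-D tensor is
 * contiguous (so one flat fill is valid) when each stride equals the product
 * of the dimensions to its right. */
static int is_contiguous_4d(const int dim[4], const int stride[4])
{
  int expected = 1;
  int d;
  for (d = 3; d >= 0; d--)
  {
    if (stride[d] != expected)
      return 0;
    expected *= dim[d];
  }
  return 1;
}

/* Flat fill used when is_contiguous_4d() holds. */
static void fill_f32(float* const p, const int dim[4], const float value)
{
  const int count = dim[0] * dim[1] * dim[2] * dim[3];
  int x;
  for (x = 0; x < count; x++)
    p[x] = value;
}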
528
529
void _ccv_nnc_tensor_set_cpu_ref_f64(ccv_nnc_tensor_view_t* const a, const double b)
530
1
{
531
  // Assuming this is double.
532
1
  int dim[CCV_NNC_MAX_DIM_ALLOC];
533
1
  int astride[CCV_NNC_MAX_DIM_ALLOC];
534
1
  int x;
535
1
  if (!CCV_IS_TENSOR_VIEW(a))
536
1
  {
537
    // Super optimal case, just do one for-loop for sum.
538
1
    const int tensor_count = ccv_nnc_tensor_count(a->info);
539
7.92k
    for (x = 0; x < tensor_count; x++)
540
7.92k
      a->data.f64[x] = b;
541
1
    return;
542
1
  }
543
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
544
0
  ccv_nnc_tensor_view_get_dim(a, dim);
545
0
  ccv_nnc_tensor_view_get_stride(a, astride);
546
0
  int i[CCV_NNC_MAX_DIM + 2];
547
0
  double* const ap = a->data.f64;
548
0
  const int count = dim[2] * dim[3];
549
0
  if (astride[2] == dim[3])
550
0
  {
551
    // Special casing if the ainc[3] is the same as dim[3]
552
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
553
0
    {
554
0
      double* ap0 = ap + i[0] * astride[0];
555
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
556
0
      {
557
0
        for (x = 0; x < count; x++)
558
0
          ap0[x] = b;
559
0
        ap0 += astride[1];
560
0
      }
561
0
    }
562
0
    return;
563
0
  } else if (astride[3] == 1) {
564
    // The case the last dimension is packed.
565
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
566
0
    {
567
0
      double* const ap0 = ap + i[0] * astride[0];
568
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
569
0
      {
570
0
        double* ap1 = ap0 + i[1] * astride[1];
571
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
572
0
        {
573
0
          for (x = 0; x < dim[3]; x++)
574
0
            ap1[x] = b;
575
0
          ap1 += astride[2];
576
0
        }
577
0
      }
578
0
    }
579
0
    return;
580
0
  }
581
  // Non-optimal case, need to do skip copy.
582
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
583
0
  {
584
0
    double* const ap0 = ap + i[0] * astride[0];
585
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
586
0
    {
587
0
      double* ap1 = ap0 + i[1] * astride[1];
588
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
589
0
      {
590
0
        for (x = 0; x < dim[3]; x++)
591
0
          ap1[x * astride[3]] = b;
592
0
        ap1 += astride[2];
593
0
      }
594
0
    }
595
0
  }
596
0
}
597
598
void _ccv_nnc_tensor_set_cpu_ref_i32(ccv_nnc_tensor_view_t* const a, const int b)
599
4
{
600
  // Assuming this is int 32.
601
4
  int dim[CCV_NNC_MAX_DIM_ALLOC];
602
4
  int astride[CCV_NNC_MAX_DIM_ALLOC];
603
4
  int x;
604
4
  if (!CCV_IS_TENSOR_VIEW(a))
605
4
  {
606
    // Super optimal case, just do one for-loop for sum.
607
4
    const int tensor_count = ccv_nnc_tensor_count(a->info);
608
11
    for (x = 0; x < tensor_count; x++)
609
7
      a->data.i32[x] = b;
610
4
    return;
611
4
  }
612
4
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
613
0
  ccv_nnc_tensor_view_get_dim(a, dim);
614
0
  ccv_nnc_tensor_view_get_stride(a, astride);
615
0
  int i[CCV_NNC_MAX_DIM + 2];
616
0
  int* const ap = a->data.i32;
617
0
  const int count = dim[2] * dim[3];
618
0
  if (astride[2] == dim[3])
619
0
  {
620
    // Special casing if the ainc[3] is the same as dim[3]
621
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
622
0
    {
623
0
      int* ap0 = ap + i[0] * astride[0];
624
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
625
0
      {
626
0
        for (x = 0; x < count; x++)
627
0
          ap0[x] = b;
628
0
        ap0 += astride[1];
629
0
      }
630
0
    }
631
0
    return;
632
0
  } else if (astride[3] == 1) {
633
    // The case the last dimension is packed.
634
0
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
635
0
    {
636
0
      int* const ap0 = ap + i[0] * astride[0];
637
0
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
638
0
      {
639
0
        int* ap1 = ap0 + i[1] * astride[1];
640
0
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
641
0
        {
642
0
          for (x = 0; x < dim[3]; x++)
643
0
            ap1[x] = b;
644
0
          ap1 += astride[2];
645
0
        }
646
0
      }
647
0
    }
648
0
    return;
649
0
  }
650
  // Non-optimal case, need to do skip copy.
651
0
  for (i[0] = 0; i[0] < dim[0]; i[0]++)
652
0
  {
653
0
    int* const ap0 = ap + i[0] * astride[0];
654
0
    for (i[1] = 0; i[1] < dim[1]; i[1]++)
655
0
    {
656
0
      int* ap1 = ap0 + i[1] * astride[1];
657
0
      for (i[2] = 0; i[2] < dim[2]; i[2]++)
658
0
      {
659
0
        for (x = 0; x < dim[3]; x++)
660
0
          ap1[x * astride[3]] = b;
661
0
        ap1 += astride[2];
662
0
      }
663
0
    }
664
0
  }
665
0
}
666
667
static int _ccv_nnc_data_transfer(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
668
2.05k
{
669
2.05k
  int i;
670
8.10k
  for (i = 0; i < ccv_min(input_size, output_size); i++)
671
6.04k
  {
672
6.04k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
673
6.04k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
674
6.04k
    if (a != b) // Only do transfer if these are two different tensors.
675
78
    {
676
78
      assert(a->info.datatype == b->info.datatype);
677
78
      if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF)
678
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
679
78
      else if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
680
78
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
681
0
      else if (a->info.datatype == CCV_64F)
682
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
683
0
      else if (a->info.datatype == CCV_8U)
684
0
        _ccv_nnc_tensor_transfer_cpu_ref_u8(a, b);
685
78
    }
686
6.04k
  }
687
2.05k
  return CCV_NNC_EXEC_SUCCESS;
688
2.05k
}
689
690
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
691
1
{
692
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
693
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
694
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
695
1
  registry->algorithms = 1;
696
1
  registry->exec = _ccv_nnc_data_transfer;
697
1
}
698
699
REGISTER_COMMAND_BACKEND(CCV_NNC_DATA_TRANSFER_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
700
1
{
701
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
702
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
703
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
704
1
  registry->algorithms = 1;
705
1
  registry->exec = _ccv_nnc_data_transfer;
706
1
}
707
708
static int _ccv_nnc_set_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
709
11.0k
{
710
11.0k
  int i;
711
11.0k
  if (cmd.info.blas.a[0] == 0)
712
310
    for (i = 0; i < output_size; i++)
713
155
      ccv_nnc_tensor_zero(outputs[i]);
714
10.8k
  else
715
21.7k
    for (i = 0; i < output_size; i++)
716
10.8k
      if (outputs[i]->info.datatype == CCV_16F)
717
0
        _ccv_nnc_tensor_set_cpu_ref_f16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
718
10.8k
      else if (outputs[i]->info.datatype == CCV_16BF)
719
0
        _ccv_nnc_tensor_set_cpu_ref_bf16((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
720
10.8k
      else if (outputs[i]->info.datatype == CCV_32F)
721
10.8k
        _ccv_nnc_tensor_set_cpu_ref_f32((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
722
2
      else if (outputs[i]->info.datatype == CCV_64F)
723
1
        _ccv_nnc_tensor_set_cpu_ref_f64((ccv_nnc_tensor_view_t*)outputs[i], cmd.info.blas.a[0]);
724
1
      else if (outputs[i]->info.datatype == CCV_32S)
725
1
        _ccv_nnc_tensor_set_cpu_ref_i32((ccv_nnc_tensor_view_t*)outputs[i], (int)cmd.info.blas.a[0]);
726
0
      else
727
0
        { assert(0); }
728
11.0k
  return CCV_NNC_EXEC_SUCCESS;
729
11.0k
}
730
731
static int _ccv_nnc_set_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
732
0
{
733
0
  int i;
734
0
  for (i = 0; i < output_size; i++)
735
0
    ccv_nnc_tensor_zero(outputs[i]);
736
0
  return CCV_NNC_EXEC_SUCCESS;
737
0
}
738
739
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
740
1
{
741
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
742
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
743
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
744
1
  registry->algorithms = 1;
745
1
  registry->exec = _ccv_nnc_set_forw;
746
1
}
747
748
REGISTER_COMMAND_BACKEND(CCV_NNC_SET_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
749
1
{
750
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
751
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_32S | CCV_16BF;
752
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
753
1
  registry->algorithms = 1;
754
1
  registry->exec = _ccv_nnc_set_back;
755
1
}
756
757
static void _ccv_nnc_tensor_nhwc_nchw_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
758
10
{
759
  // Assuming this is float 32.
760
10
  int astride[CCV_NNC_MAX_DIM_ALLOC];
761
10
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
762
10
  int k;
763
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
764
10
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
765
10
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
766
10
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
767
10
  assert(a_offset == 0 || a_offset == 1);
768
10
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
769
10
  assert(b_offset == 0 || b_offset == 1);
770
10
  ccv_nnc_tensor_view_get_stride(a, astride);
771
10
  ccv_nnc_tensor_view_get_stride(b, bstride);
772
  // Comparing N
773
10
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
774
10
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
775
  // Comparing C
776
10
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
777
10
  const int c = a->info.dim[a_nd - 1];
778
  // Comparing HW
779
10
  int hw[CCV_NNC_MAX_DIM];
780
30
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
781
20
  {
782
20
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
783
20
    hw[k] = a->info.dim[k + 1 - a_offset];
784
20
  }
785
10
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
786
10
  int i[CCV_NNC_MAX_DIM + 2];
787
10
  float* const ap = a->data.f32;
788
10
  float* const bp = b->data.f32;
789
  // Non-optimal case, need to do skip copy.
790
117
  for (i[0] = 0; i[0] < n; i[0]++)
791
107
  {
792
107
    float* ap0 = ap + i[0] * astride[0];
793
107
    float* const bp0 = bp + i[0] * bstride[0];
794
1.17k
    for (i[3] = 0; i[3] < c; i[3]++)
795
1.06k
    {
796
1.06k
      float* apu = ap0 + i[3];
797
1.06k
      float* bp1 = bp0 + i[3] * bstride[1];
798
21.7k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
799
20.6k
      {
800
3.37M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
801
3.35M
          bp1[i[2]] = apu[i[2] * astride[2]];
802
20.6k
        apu += astride[1];
803
20.6k
        bp1 += bstride[2];
804
20.6k
      }
805
1.06k
    }
806
107
  }
807
10
}
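
_ccv_nnc_tensor_nhwc_nchw_f32 walks the output in NCHW order while reading the input through its NHWC strides: channel c of pixel (h, w) in batch n moves from a[((n*H + h)*W + w)*C + c] to b[((n*C + c)*H + h)*W + w]. A compact sketch of that index mapping for packed (non-view) tensors, with hypothetical names:

/* Illustrative sketch, not part of the instrumented source: packed NHWC -> NCHW
 * relayout for float data. The kernel above additionally handles tensor views
 * through explicit per-dimension strides. */
static void nhwc_to_nchw_f32(float* nchw, const float* nhwc,
  const int n, const int c, const int h, const int w)
{
  int in, ic, ih, iw;
  for (in = 0; in < n; in++)
    for (ic = 0; ic < c; ic++)
      for (ih = 0; ih < h; ih++)
        for (iw = 0; iw < w; iw++)
          nchw[((in * c + ic) * h + ih) * w + iw] =
            nhwc[((in * h + ih) * w + iw) * c + ic];
}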
808
809
static void _ccv_nnc_tensor_nchw_nhwc_f32(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
810
9
{
811
  // Assuming this is float 32.
812
9
  int astride[CCV_NNC_MAX_DIM_ALLOC];
813
9
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
814
9
  int k;
815
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
816
9
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
817
9
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
818
9
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
819
9
  assert(a_offset == 0 || a_offset == 1);
820
9
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
821
9
  assert(b_offset == 0 || b_offset == 1);
822
9
  ccv_nnc_tensor_view_get_stride(a, astride);
823
9
  ccv_nnc_tensor_view_get_stride(b, bstride);
824
  // Comparing N
825
9
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
826
9
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
827
  // Comparing C
828
9
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
829
9
  const int c = a->info.dim[1 - a_offset];
830
  // Comparing HW
831
9
  int hw[CCV_NNC_MAX_DIM];
832
27
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
833
18
  {
834
18
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
835
18
    hw[k] = a->info.dim[k + 2 - a_offset];
836
18
  }
837
9
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
838
9
  int i[CCV_NNC_MAX_DIM + 2];
839
9
  float* const ap = a->data.f32;
840
9
  float* const bp = b->data.f32;
841
  // Non-optimal case, need to do skip copy.
842
20
  for (i[0] = 0; i[0] < n; i[0]++)
843
11
  {
844
11
    float* const ap0 = ap + i[0] * astride[0];
845
11
    float* const bp0 = bp + i[0] * bstride[0];
846
792
    for (i[3] = 0; i[3] < c; i[3]++)
847
781
    {
848
781
      float* bpu = bp0 + i[3];
849
781
      float* ap1 = ap0 + i[3] * astride[1];
850
20.1k
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
851
19.3k
      {
852
5.04M
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
853
5.02M
          bpu[i[2] * bstride[2]] = ap1[i[2]];
854
19.3k
        ap1 += astride[2];
855
19.3k
        bpu += bstride[1];
856
19.3k
      }
857
781
    }
858
11
  }
859
9
}
860
861
static void _ccv_nnc_tensor_nhwc_nchw_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
862
1
{
863
  // Assuming this is float 64 (double).
864
1
  int astride[CCV_NNC_MAX_DIM_ALLOC];
865
1
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
866
1
  int k;
867
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
868
1
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
869
1
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
870
1
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
871
1
  assert(a_offset == 0 || a_offset == 1);
872
1
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
873
1
  assert(b_offset == 0 || b_offset == 1);
874
1
  ccv_nnc_tensor_view_get_stride(a, astride);
875
1
  ccv_nnc_tensor_view_get_stride(b, bstride);
876
  // Comparing N
877
1
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
878
1
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
879
  // Comparing C
880
1
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
881
1
  const int c = a->info.dim[a_nd - 1];
882
  // Comparing HW
883
1
  int hw[CCV_NNC_MAX_DIM];
884
3
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
885
2
  {
886
2
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
887
2
    hw[k] = a->info.dim[k + 1 - a_offset];
888
2
  }
889
1
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
890
1
  int i[CCV_NNC_MAX_DIM + 2];
891
1
  double* const ap = a->data.f64;
892
1
  double* const bp = b->data.f64;
893
  // Non-optimal case, need to do skip copy.
894
12
  for (i[0] = 0; i[0] < n; i[0]++)
895
11
  {
896
11
    double* ap0 = ap + i[0] * astride[0];
897
11
    double* const bp0 = bp + i[0] * bstride[0];
898
99
    for (i[3] = 0; i[3] < c; i[3]++)
899
88
    {
900
88
      double* apu = ap0 + i[3];
901
88
      double* bp1 = bp0 + i[3] * bstride[1];
902
968
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
903
880
      {
904
8.80k
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
905
7.92k
          bp1[i[2]] = apu[i[2] * astride[2]];
906
880
        apu += astride[1];
907
880
        bp1 += bstride[2];
908
880
      }
909
88
    }
910
11
  }
911
1
}
912
913
static void _ccv_nnc_tensor_nchw_nhwc_f64(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
914
0
{
915
  // Assuming this is float 64 (double).
916
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
917
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
918
0
  int k;
919
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
920
0
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
921
0
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
922
0
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
923
0
  assert(a_offset == 0 || a_offset == 1);
924
0
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
925
0
  assert(b_offset == 0 || b_offset == 1);
926
0
  ccv_nnc_tensor_view_get_stride(a, astride);
927
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
928
  // Comparing N
929
0
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
930
0
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
931
  // Comparing C
932
0
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
933
0
  const int c = a->info.dim[1 - a_offset];
934
  // Comparing HW
935
0
  int hw[CCV_NNC_MAX_DIM];
936
0
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
937
0
  {
938
0
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
939
0
    hw[k] = a->info.dim[k + 2 - a_offset];
940
0
  }
941
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
942
0
  int i[CCV_NNC_MAX_DIM + 2];
943
0
  double* const ap = a->data.f64;
944
0
  double* const bp = b->data.f64;
945
  // Non-optimal case, need to do skip copy.
946
0
  for (i[0] = 0; i[0] < n; i[0]++)
947
0
  {
948
0
    double* const ap0 = ap + i[0] * astride[0];
949
0
    double* const bp0 = bp + i[0] * bstride[0];
950
0
    for (i[3] = 0; i[3] < c; i[3]++)
951
0
    {
952
0
      double* bpu = bp0 + i[3];
953
0
      double* ap1 = ap0 + i[3] * astride[1];
954
0
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
955
0
      {
956
0
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
957
0
          bpu[i[2] * bstride[2]] = ap1[i[2]];
958
0
        ap1 += astride[2];
959
0
        bpu += bstride[1];
960
0
      }
961
0
    }
962
0
  }
963
0
}
964
965
static void _ccv_nnc_tensor_nhwc_nchw_f16(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
966
0
{
967
  // Assuming this is half precision (16-bit float).
968
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
969
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
970
0
  int k;
971
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
972
0
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
973
0
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
974
0
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
975
0
  assert(a_offset == 0 || a_offset == 1);
976
0
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
977
0
  assert(b_offset == 0 || b_offset == 1);
978
0
  ccv_nnc_tensor_view_get_stride(a, astride);
979
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
980
  // Comparing N
981
0
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
982
0
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
983
  // Comparing C
984
0
  assert(a->info.dim[a_nd - 1] == b->info.dim[1 - b_offset]);
985
0
  const int c = a->info.dim[a_nd - 1];
986
  // Comparing HW
987
0
  int hw[CCV_NNC_MAX_DIM];
988
0
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
989
0
  {
990
0
    assert(a->info.dim[k + 1 - a_offset] == b->info.dim[k + 2 - b_offset]);
991
0
    hw[k] = a->info.dim[k + 1 - a_offset];
992
0
  }
993
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
994
0
  int i[CCV_NNC_MAX_DIM + 2];
995
0
  ccv_float16_t* const ap = a->data.f16;
996
0
  ccv_float16_t* const bp = b->data.f16;
997
  // Non-optimal case, need to do skip copy.
998
0
  for (i[0] = 0; i[0] < n; i[0]++)
999
0
  {
1000
0
    ccv_float16_t* ap0 = ap + i[0] * astride[0];
1001
0
    ccv_float16_t* const bp0 = bp + i[0] * bstride[0];
1002
0
    for (i[3] = 0; i[3] < c; i[3]++)
1003
0
    {
1004
0
      ccv_float16_t* apu = ap0 + i[3];
1005
0
      ccv_float16_t* bp1 = bp0 + i[3] * bstride[1];
1006
0
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
1007
0
      {
1008
0
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
1009
0
          bp1[i[2]] = apu[i[2] * astride[2]];
1010
0
        apu += astride[1];
1011
0
        bp1 += bstride[2];
1012
0
      }
1013
0
    }
1014
0
  }
1015
0
}
1016
1017
static void _ccv_nnc_tensor_nchw_nhwc_f16(const ccv_nnc_tensor_view_t* a, ccv_nnc_tensor_view_t* b)
1018
0
{
1019
  // Assuming this is half precision (16-bit float).
1020
0
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1021
0
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1022
0
  int k;
1023
  // In case it is Toll-free bridged matrix object (NHWC format is possible).
1024
0
  const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
1025
0
  const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
1026
0
  const int a_offset = CCV_NNC_MAX_DIM + 2 - a_nd;
1027
0
  assert(a_offset == 0 || a_offset == 1);
1028
0
  const int b_offset = CCV_NNC_MAX_DIM + 2 - b_nd;
1029
0
  assert(b_offset == 0 || b_offset == 1);
1030
0
  ccv_nnc_tensor_view_get_stride(a, astride);
1031
0
  ccv_nnc_tensor_view_get_stride(b, bstride);
1032
  // Comparing N
1033
0
  assert((a_offset == 0 ? a->info.dim[0] : 1) == (b_offset == 0 ? b->info.dim[0] : 1));
1034
0
  const int n = (a_offset == 0 ? a->info.dim[0] : 1);
1035
  // Comparing C
1036
0
  assert(a->info.dim[1 - a_offset] == b->info.dim[b_nd - 1]);
1037
0
  const int c = a->info.dim[1 - a_offset];
1038
  // Comparing HW
1039
0
  int hw[CCV_NNC_MAX_DIM];
1040
0
  for (k = 0; k < CCV_NNC_MAX_DIM; k++)
1041
0
  {
1042
0
    assert(a->info.dim[k + 2 - a_offset] == b->info.dim[k + 1 - b_offset]);
1043
0
    hw[k] = a->info.dim[k + 2 - a_offset];
1044
0
  }
1045
0
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1046
0
  int i[CCV_NNC_MAX_DIM + 2];
1047
0
  ccv_float16_t* const ap = a->data.f16;
1048
0
  ccv_float16_t* const bp = b->data.f16;
1049
  // Non-optimal case, need to do skip copy.
1050
0
  for (i[0] = 0; i[0] < n; i[0]++)
1051
0
  {
1052
0
    ccv_float16_t* const ap0 = ap + i[0] * astride[0];
1053
0
    ccv_float16_t* const bp0 = bp + i[0] * bstride[0];
1054
0
    for (i[3] = 0; i[3] < c; i[3]++)
1055
0
    {
1056
0
      ccv_float16_t* bpu = bp0 + i[3];
1057
0
      ccv_float16_t* ap1 = ap0 + i[3] * astride[1];
1058
0
      for (i[1] = 0; i[1] < hw[0]; i[1]++)
1059
0
      {
1060
0
        for (i[2] = 0; i[2] < hw[1]; i[2]++)
1061
0
          bpu[i[2] * bstride[2]] = ap1[i[2]];
1062
0
        ap1 += astride[2];
1063
0
        bpu += bstride[1];
1064
0
      }
1065
0
    }
1066
0
  }
1067
0
}
1068
1069
static int _ccv_nnc_format_transform(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1070
2.03k
{
1071
2.03k
  assert(output_size <= input_size);
1072
2.03k
  int i;
1073
4.06k
  for (i = 0; i < output_size; i++)
1074
2.03k
  {
1075
2.03k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
1076
2.03k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
1077
2.03k
    assert(a != b); // Cannot do inplace transform.
1078
2.03k
    assert(a->info.datatype == b->info.datatype);
1079
2.03k
    if (a->info.dim[0] == 0 || b->info.dim[0] == 0)
1080
0
      continue;
1081
2.03k
    if (a->info.datatype == CCV_32F || a->info.datatype == CCV_32S)
1082
2.03k
    {
1083
2.03k
      if (a->info.format == b->info.format) {
1084
        // If it is the same, just do a normal data transfer.
1085
2.01k
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
1086
2.01k
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1087
10
        _ccv_nnc_tensor_nhwc_nchw_f32(a, b);
1088
10
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1089
0
        assert(0);
1090
9
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1091
9
        _ccv_nnc_tensor_nchw_nhwc_f32(a, b);
1092
9
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1093
0
        assert(0);
1094
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1095
0
        assert(0);
1096
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1097
0
        assert(0);
1098
0
      }
1099
2.03k
    } else if (a->info.datatype == CCV_64F) {
1100
1
      if (a->info.format == b->info.format) {
1101
        // If it is the same, just do a normal data transfer.
1102
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
1103
1
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1104
1
        _ccv_nnc_tensor_nhwc_nchw_f64(a, b);
1105
1
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1106
0
        assert(0);
1107
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1108
0
        _ccv_nnc_tensor_nchw_nhwc_f64(a, b);
1109
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1110
0
        assert(0);
1111
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1112
0
        assert(0);
1113
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1114
0
        assert(0);
1115
0
      }
1116
1
    } else if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF) {
1117
0
      if (a->info.format == b->info.format) {
1118
        // If it is the same, just do a normal data transfer.
1119
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
1120
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1121
0
        _ccv_nnc_tensor_nhwc_nchw_f16(a, b);
1122
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1123
0
        assert(0);
1124
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1125
0
        _ccv_nnc_tensor_nchw_nhwc_f16(a, b);
1126
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1127
0
        assert(0);
1128
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1129
0
        assert(0);
1130
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1131
0
        assert(0);
1132
0
      }
1133
0
    } else if (a->info.datatype == CCV_8U) {
1134
0
      if (a->info.format == b->info.format) {
1135
        // If it is the same, just do a normal data transfer.
1136
0
        _ccv_nnc_tensor_transfer_cpu_ref_u8(a, b);
1137
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1138
0
        assert(0);
1139
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NHWC && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1140
0
        assert(0);
1141
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1142
0
        assert(0);
1143
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_NCHW && b->info.format == CCV_TENSOR_FORMAT_CHWN) {
1144
0
        assert(0);
1145
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NHWC) {
1146
0
        assert(0);
1147
0
      } else if (a->info.format == CCV_TENSOR_FORMAT_CHWN && b->info.format == CCV_TENSOR_FORMAT_NCHW) {
1148
0
        assert(0);
1149
0
      }
1150
0
    } else {
1151
0
      assert(0);
1152
0
    }
1153
2.03k
  }
1154
2.03k
  return CCV_NNC_EXEC_SUCCESS;
1155
2.03k
}
1156
1157
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1158
1
{
1159
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1160
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF;
1161
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1162
1
  registry->algorithms = 1;
1163
1
  registry->exec = _ccv_nnc_format_transform;
1164
1
}
1165
1166
REGISTER_COMMAND_BACKEND(CCV_NNC_FORMAT_TRANSFORM_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1167
1
{
1168
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1169
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_32S | CCV_16F | CCV_8U | CCV_16BF;
1170
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1171
1
  registry->algorithms = 1;
1172
1
  registry->exec = _ccv_nnc_format_transform;
1173
1
}
1174
1175
static int _ccv_nnc_transpose(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1176
55
{
1177
55
  assert(output_size <= input_size);
1178
55
  int k;
1179
110
  for (k = 0; k < output_size; k++)
1180
55
  {
1181
55
    const ccv_nnc_tensor_view_t* const a = (ccv_nnc_tensor_view_t*)inputs[k];
1182
55
    ccv_nnc_tensor_view_t* const b = (ccv_nnc_tensor_view_t*)outputs[k];
1183
55
    const int a_nd = ccv_nnc_tensor_nd(a->info.dim);
1184
55
    const int b_nd = ccv_nnc_tensor_nd(b->info.dim);
1185
55
    assert(a_nd == b_nd);
1186
55
    assert(a_nd <= CCV_NNC_MAX_DIM + 2); // I can only handle maximum 4.
1187
55
    assert(a_nd >= 2 && b_nd >= 2); // You cannot transpose if it is less than 2.
1188
55
    assert(a->info.dim[cmd.info.transpose.axis[0]] == b->info.dim[cmd.info.transpose.axis[1]]);
1189
55
    assert(a->info.dim[cmd.info.transpose.axis[1]] == b->info.dim[cmd.info.transpose.axis[0]]);
1190
55
    int x;
1191
271
    for (x = 0; x < a_nd; x++)
1192
216
      if (x != cmd.info.transpose.axis[0] && x != cmd.info.transpose.axis[1])
1193
106
        { assert(a->info.dim[x] == b->info.dim[x]); }
1194
55
    size_t astride[CCV_NNC_MAX_DIM + 2];
1195
55
    size_t bstride[CCV_NNC_MAX_DIM + 2];
1196
55
    int dim[CCV_NNC_MAX_DIM + 2];
1197
59
    for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++)
1198
4
      dim[x] = 1;
1199
271
    for (x = 0; x < b_nd; x++)
1200
216
      dim[x] = b->info.dim[x];
1201
    // Don't use ccv_nnc_tensor_view_get_inc or get_dim because these will prefill beginning to 1:
1202
    // for example, if the dimension is [2, 4], it will fill to [1, 1, 2, 4] so the axis index will
1203
    // be messed up.
1204
55
    if (CCV_IS_TENSOR_VIEW(a))
1205
6
    {
1206
8
      for (x = a_nd; x < CCV_NNC_MAX_DIM + 2; x++)
1207
2
        astride[x] = 1;
1208
28
      for (x = 0; x < a_nd; x++)
1209
22
        astride[x] = a->stride[x];
1210
49
    } else {
1211
49
      const int* const adim = a->info.dim;
1212
100
      for (x = a_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
1213
51
        astride[x] = 1;
1214
194
      for (x = a_nd - 2; x >= 0; x--)
1215
145
        astride[x] = astride[x + 1] * adim[x + 1];
1216
49
    }
1217
55
    if (CCV_IS_TENSOR_VIEW(b))
1218
6
    {
1219
8
      for (x = b_nd; x < CCV_NNC_MAX_DIM + 2; x++)
1220
2
        bstride[x] = 1;
1221
28
      for (x = 0; x < b_nd; x++)
1222
22
        bstride[x] = b->stride[x];
1223
49
    } else {
1224
49
      const int* const bdim = b->info.dim;
1225
100
      for (x = b_nd - 1; x < CCV_NNC_MAX_DIM + 2; x++)
1226
51
        bstride[x] = 1;
1227
194
      for (x = b_nd - 2; x >= 0; x--)
1228
145
        bstride[x] = bstride[x + 1] * bdim[x + 1];
1229
49
    }
1230
55
    const float* const ap = a->data.f32;
1231
55
    float* const bp = b->data.f32;
1232
55
    int i[CCV_NNC_MAX_DIM + 2];
1233
55
    int j[CCV_NNC_MAX_DIM + 2] = {
1234
55
      0, 1, 2, 3
1235
55
    };
1236
55
    CCV_SWAP(j[cmd.info.transpose.axis[0]], j[cmd.info.transpose.axis[1]], x);
1237
658
    for (i[0] = 0; i[0] < dim[0]; i[0]++)
1238
603
    {
1239
603
      float* const bp0 = bp + i[0] * bstride[0];
1240
28.4k
      for (i[1] = 0; i[1] < dim[1]; i[1]++)
1241
27.8k
      {
1242
27.8k
        float* const bp1 = bp0 + i[1] * bstride[1];
1243
578k
        for (i[2] = 0; i[2] < dim[2]; i[2]++)
1244
550k
        {
1245
550k
          float* const bp2 = bp1 + i[2] * bstride[2];
1246
43.3M
          for (i[3] = 0; i[3] < dim[3]; i[3]++)
1247
42.8M
            bp2[i[3]] = ap[i[j[0]] * astride[0] + i[j[1]] * astride[1] + i[j[2]] * astride[2] + i[j[3]] * astride[3]];
1248
550k
        }
1249
27.8k
      }
1250
603
    }
1251
55
  }
1252
55
  return CCV_NNC_EXEC_SUCCESS;
1253
55
}
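
The transpose kernel never moves axes explicitly: it builds a permutation j[] of {0, 1, 2, 3} by swapping the two requested axes, then writes the output densely while reading the input at permuted indices through its strides. A standalone sketch of the same stride-permutation idea, assuming packed tensors and hypothetical names:

/* Illustrative sketch, not part of the instrumented source: transpose two axes
 * of a 4-D float tensor by permuting the index used on the read side. odim[]
 * is the output shape, in_stride[] the input's element strides. */
static void transpose_4d_f32(float* out, const float* in,
  const int odim[4], const int in_stride[4], const int axis0, const int axis1)
{
  int j[4] = { 0, 1, 2, 3 };
  const int t = j[axis0];
  j[axis0] = j[axis1];
  j[axis1] = t;
  int i[4];
  for (i[0] = 0; i[0] < odim[0]; i[0]++)
    for (i[1] = 0; i[1] < odim[1]; i[1]++)
      for (i[2] = 0; i[2] < odim[2]; i[2]++)
        for (i[3] = 0; i[3] < odim[3]; i[3]++)
          *out++ = in[i[j[0]] * in_stride[0] + i[j[1]] * in_stride[1]
            + i[j[2]] * in_stride[2] + i[j[3]] * in_stride[3]];
}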
1254
1255
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1256
1
{
1257
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1258
1
  registry->tensor_datatypes = CCV_32F;
1259
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1260
1
  registry->algorithms = 1;
1261
1
  registry->exec = _ccv_nnc_transpose;
1262
1
}
1263
1264
REGISTER_COMMAND_BACKEND(CCV_NNC_TRANSPOSE_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1265
1
{
1266
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1267
1
  registry->tensor_datatypes = CCV_32F;
1268
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1269
1
  registry->algorithms = 1;
1270
1
  registry->exec = _ccv_nnc_transpose;
1271
1
}
1272
1273
static int _ccv_nnc_datatype_conversion(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1274
120k
{
1275
120k
  assert(output_size <= input_size);
1276
120k
  int i;
1277
240k
  for (i = 0; i < output_size; i++)
1278
120k
  {
1279
120k
    const ccv_nnc_tensor_view_t* a = (ccv_nnc_tensor_view_t*)inputs[i];
1280
120k
    ccv_nnc_tensor_view_t* b = (ccv_nnc_tensor_view_t*)outputs[i];
1281
120k
    assert(a != b); // Cannot do inplace transform.
1282
120k
    assert(a->info.format == b->info.format);
1283
120k
    if (a->info.datatype == b->info.datatype) {
1284
      // If it is the same, just do a normal data transfer.
1285
2
      if (a->info.datatype == CCV_16F || a->info.datatype == CCV_16BF)
1286
0
        _ccv_nnc_tensor_transfer_cpu_ref_f16(a, b);
1287
2
      else if (a->info.datatype == CCV_32F)
1288
2
        _ccv_nnc_tensor_transfer_cpu_ref_f32(a, b);
1289
0
      else if (a->info.datatype == CCV_64F)
1290
0
        _ccv_nnc_tensor_transfer_cpu_ref_f64(a, b);
1291
120k
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16F) {
1292
120k
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1293
120k
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1294
120k
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1295
120k
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1296
120k
      ccv_float_to_half_precision(a->data.f32, (uint16_t*)b->data.f16, tensor_count);
1297
120k
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_32F) {
1298
201
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1299
201
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1300
201
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1301
201
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1302
201
      ccv_half_precision_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count);
1303
201
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_32F) {
1304
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1305
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1306
1
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1307
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1308
1
      int i;
1309
129
      for (i = 0; i < tensor_count; i++)
1310
128
        b->data.f32[i] = (float)a->data.f64[i];
1311
3
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_64F) {
1312
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1313
1
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1314
1
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1315
1
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1316
129
      
      for (i = 0; i < tensor_count; i++)
1317
128
        b->data.f64[i] = (double)a->data.f32[i];
1318
2
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16F) {
1319
2
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1320
2
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1321
2
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1322
2
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1323
2
      ccv_double_to_half_precision(a->data.f64, (uint16_t*)b->data.f16, tensor_count);
1324
2
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_64F) {
1325
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1326
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1327
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1328
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1329
0
      ccv_half_precision_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
1330
0
    } else if (a->info.datatype == CCV_16F && b->info.datatype == CCV_16BF) {
1331
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1332
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1333
0
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1334
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1335
0
      ccv_half_precision_to_bfloat((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count);
1336
0
    } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_16F) {
1337
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1338
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1339
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1340
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1341
0
      ccv_bfloat_to_half_precision((uint16_t*)a->data.f16, (uint16_t*)b->data.f16, tensor_count);
1342
0
    } else if (a->info.datatype == CCV_32F && b->info.datatype == CCV_16BF) {
1343
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1344
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1345
0
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1346
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1347
0
      ccv_float_to_bfloat(a->data.f32, (uint16_t*)b->data.f16, tensor_count);
1348
0
    } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_32F) {
1349
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1350
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1351
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1352
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1353
0
      ccv_bfloat_to_float((uint16_t*)a->data.f16, b->data.f32, tensor_count);
1354
0
    } else if (a->info.datatype == CCV_64F && b->info.datatype == CCV_16BF) {
1355
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1356
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1357
0
      const size_t tensor_count = ccv_nnc_tensor_count(a->info);
1358
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1359
0
      ccv_double_to_bfloat(a->data.f64, (uint16_t*)b->data.f16, tensor_count);
1360
0
    } else if (a->info.datatype == CCV_16BF && b->info.datatype == CCV_64F) {
1361
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(a));
1362
0
      assert(CCV_IS_TENSOR_CONTIGUOUS(b));
1363
0
      const int tensor_count = ccv_nnc_tensor_count(a->info);
1364
0
      assert(tensor_count == ccv_nnc_tensor_count(b->info));
1365
0
      ccv_bfloat_to_double((uint16_t*)a->data.f16, b->data.f64, tensor_count);
1366
0
    }
1367
120k
  }
1368
120k
  return CCV_NNC_EXEC_SUCCESS;
1369
120k
}
1370
1371
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1372
1
{
1373
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1374
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF;
1375
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1376
1
  registry->algorithms = 1;
1377
1
  registry->exec = _ccv_nnc_datatype_conversion;
1378
1
}
1379
1380
REGISTER_COMMAND_BACKEND(CCV_NNC_DATATYPE_CONVERSION_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1381
1
{
1382
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1383
1
  registry->tensor_datatypes = CCV_64F | CCV_32F | CCV_16F | CCV_16BF;
1384
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1385
1
  registry->algorithms = 1;
1386
1
  registry->exec = _ccv_nnc_datatype_conversion;
1387
1
}
1388
1389
static void _ccv_nnc_masked_fill_cpu_ref_f(const float p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
1390
2
{
1391
2
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
1392
2
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
1393
2
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
1394
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
1395
2
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
1396
2
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
1397
2
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
1398
  // Assuming this is float 32.
1399
2
  int adim[CCV_NNC_MAX_DIM_ALLOC];
1400
2
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
1401
2
  ccv_nnc_tensor_view_get_dim(a, adim);
1402
2
  ccv_nnc_tensor_view_get_dim(b, bdim);
1403
2
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1404
2
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1405
2
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
1406
2
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
1407
2
  int x;
1408
2
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
1409
0
  {
1410
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1411
    // Super optimal case, just do one for-loop for sum.
1412
0
    for (x = 0; x < tensor_count; x++)
1413
0
      c->data.f32[x] = (b->data.f32[x] == p) ? q : a->data.f32[x];
1414
0
    return;
1415
0
  }
1416
2
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1417
2
  ccv_nnc_tensor_view_get_stride(a, astride);
1418
2
  ccv_nnc_tensor_view_get_stride(b, bstride);
1419
2
  ccv_nnc_tensor_view_get_stride(c, cstride);
1420
2
  int i[CCV_NNC_MAX_DIM + 2];
1421
2
  float* const ap = a->data.f32;
1422
2
  float* const bp = b->data.f32;
1423
2
  float* const cp = c->data.f32;
1424
2
  const int count = cdim[2] * cdim[3];
1425
2
  if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
1426
2
  {
1427
    // Special casing if the ainc[3] is the same as dim[3]
1428
4
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
1429
2
    {
1430
2
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
1431
2
      float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
1432
2
      float* cp0 = cp + i[0] * cstride[0];
1433
14
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
1434
12
      {
1435
12
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
1436
12
        float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
1437
252
        for (x = 0; x < count; x++)
1438
240
          cp0[x] = (bp1[x] == p) ? q : ap1[x];
1439
12
        cp0 += cstride[1];
1440
12
      }
1441
2
    }
1442
2
    return;
1443
2
  }
1444
  // Non-optimal case, need to do skip copy and handle broadcasting.
1445
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
1446
0
  {
1447
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
1448
0
    float* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
1449
0
    float* const cp0 = cp + i[0] * cstride[0];
1450
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
1451
0
    {
1452
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
1453
0
      float* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
1454
0
      float* cp1 = cp0 + i[1] * cstride[1];
1455
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
1456
0
      {
1457
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
1458
0
        float* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
1459
0
        if (adim[3] == 1)
1460
0
          for (x = 0; x < cdim[3]; x++)
1461
0
            cp1[x] = (bp2[x] == p) ? q : ap2[0];
1462
0
        else if (bdim[3] == 1)
1463
0
          if (bp2[0] == p)
1464
0
            for (x = 0; x < cdim[3]; x++)
1465
0
              cp1[x] = q;
1466
0
          else
1467
0
            for (x = 0; x < cdim[3]; x++)
1468
0
              cp1[x] = ap2[x];
1469
0
        else
1470
0
          for (x = 0; x < cdim[3]; x++)
1471
0
            cp1[x] = (bp2[x] == p) ? q : ap2[x];
1472
0
        cp1 += cstride[2];
1473
0
      }
1474
0
    }
1475
0
  }
1476
0
}
1477
1478
static void _ccv_nnc_masked_fill_cpu_ref_s(const int p, const float q, ccv_nnc_tensor_view_t* const a, ccv_nnc_tensor_view_t* const b, ccv_nnc_tensor_view_t* const c)
1479
4
{
1480
4
  int cdim[CCV_NNC_MAX_DIM_ALLOC];
1481
4
  ccv_nnc_tensor_view_get_dim(a, cdim); // Fill in cdim first.
1482
4
  ccv_nnc_tensor_view_get_broadcast_dim(b, cdim);
1483
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(a, cdim));
1484
4
  assert(ccv_nnc_tensor_view_check_broadcast_dim(b, cdim));
1485
4
  const int a_check_dim = ccv_nnc_tensor_view_check_dim(a, cdim);
1486
4
  const int b_check_dim = ccv_nnc_tensor_view_check_dim(b, cdim);
1487
  // Assuming this is float 32.
1488
4
  int adim[CCV_NNC_MAX_DIM_ALLOC];
1489
4
  int bdim[CCV_NNC_MAX_DIM_ALLOC];
1490
4
  ccv_nnc_tensor_view_get_dim(a, adim);
1491
4
  ccv_nnc_tensor_view_get_dim(b, bdim);
1492
4
  int astride[CCV_NNC_MAX_DIM_ALLOC];
1493
4
  int bstride[CCV_NNC_MAX_DIM_ALLOC];
1494
4
  int cstride[CCV_NNC_MAX_DIM_ALLOC];
1495
4
  assert(ccv_nnc_tensor_view_check_dim(c, cdim));
1496
4
  int x;
1497
4
  if (!CCV_IS_TENSOR_VIEW(a) && !CCV_IS_TENSOR_VIEW(b) && !CCV_IS_TENSOR_VIEW(c) && a_check_dim && b_check_dim)
1498
0
  {
1499
0
    const int tensor_count = ccv_nnc_tensor_count(a->info);
1500
    // Super optimal case, just do one for-loop for sum.
1501
0
    for (x = 0; x < tensor_count; x++)
1502
0
      c->data.f32[x] = (b->data.i32[x] == p) ? q : a->data.f32[x];
1503
0
    return;
1504
0
  }
1505
4
  assert(CCV_NNC_MAX_DIM == 2); // Need to change this logic for CCV_NNC_MAX_DIM == other number.
1506
4
  ccv_nnc_tensor_view_get_stride(a, astride);
1507
4
  ccv_nnc_tensor_view_get_stride(b, bstride);
1508
4
  ccv_nnc_tensor_view_get_stride(c, cstride);
1509
4
  int i[CCV_NNC_MAX_DIM + 2];
1510
4
  float* const ap = a->data.f32;
1511
4
  int* const bp = b->data.i32;
1512
4
  float* const cp = c->data.f32;
1513
4
  const int count = cdim[2] * cdim[3];
1514
4
  if (astride[2] == cdim[3] && bstride[2] == cdim[3] && cstride[2] == cdim[3] && adim[2] == cdim[2] && bdim[2] == cdim[2])
1515
4
  {
1516
    // Special casing if the ainc[3] is the same as dim[3]
1517
8
    for (i[0] = 0; i[0] < cdim[0]; i[0]++)
1518
4
    {
1519
4
      float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
1520
4
      int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
1521
4
      float* cp0 = cp + i[0] * cstride[0];
1522
28
      for (i[1] = 0; i[1] < cdim[1]; i[1]++)
1523
24
      {
1524
24
        float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
1525
24
        int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
1526
504
        for (x = 0; x < count; x++)
1527
480
          cp0[x] = (bp1[x] == p) ? q : ap1[x];
1528
24
        cp0 += cstride[1];
1529
24
      }
1530
4
    }
1531
4
    return;
1532
4
  }
1533
  // Non-optimal case, need to do skip copy and handle broadcasting.
1534
0
  for (i[0] = 0; i[0] < cdim[0]; i[0]++)
1535
0
  {
1536
0
    float* const ap0 = adim[0] == 1 ? ap : ap + i[0] * astride[0];
1537
0
    int* const bp0 = bdim[0] == 1 ? bp : bp + i[0] * bstride[0];
1538
0
    float* const cp0 = cp + i[0] * cstride[0];
1539
0
    for (i[1] = 0; i[1] < cdim[1]; i[1]++)
1540
0
    {
1541
0
      float* const ap1 = adim[1] == 1 ? ap0 : ap0 + i[1] * astride[1];
1542
0
      int* const bp1 = bdim[1] == 1 ? bp0 : bp0 + i[1] * bstride[1];
1543
0
      float* cp1 = cp0 + i[1] * cstride[1];
1544
0
      for (i[2] = 0; i[2] < cdim[2]; i[2]++)
1545
0
      {
1546
0
        float* const ap2 = adim[2] == 1 ? ap1 : ap1 + i[2] * astride[2];
1547
0
        int* const bp2 = bdim[2] == 1 ? bp1 : bp1 + i[2] * bstride[2];
1548
0
        if (adim[3] == 1)
1549
0
          for (x = 0; x < cdim[3]; x++)
1550
0
            cp1[x] = (bp2[x] == p) ? q : ap2[0];
1551
0
        else if (bdim[3] == 1)
1552
0
          if (bp2[0] == p)
1553
0
            for (x = 0; x < cdim[3]; x++)
1554
0
              cp1[x] = q;
1555
0
          else
1556
0
            for (x = 0; x < cdim[3]; x++)
1557
0
              cp1[x] = ap2[x];
1558
0
        else
1559
0
          for (x = 0; x < cdim[3]; x++)
1560
0
            cp1[x] = (bp2[x] == p) ? q : ap2[x];
1561
0
        cp1 += cstride[2];
1562
0
      }
1563
0
    }
1564
0
  }
1565
0
}
1566
1567
static int _ccv_nnc_masked_fill_forw(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1568
3
{
1569
3
  assert(input_size >= 2);
1570
3
  assert(inputs[0]);
1571
3
  assert(inputs[1]);
1572
3
  assert(outputs[0]);
1573
3
  if (inputs[1]->info.datatype == CCV_32F)
1574
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
1575
2
  else if (inputs[1]->info.datatype == CCV_32S)
1576
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), cmd.info.blas.a[1], (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[1], (ccv_nnc_tensor_view_t*)outputs[0]);
1577
3
  return CCV_NNC_EXEC_SUCCESS;
1578
3
}
1579
1580
static int _ccv_nnc_masked_fill_back(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
1581
3
{
1582
3
  assert(input_size >= 3);
1583
3
  if (inputs[2]->info.datatype == CCV_32F)
1584
1
    _ccv_nnc_masked_fill_cpu_ref_f(cmd.info.blas.a[0], 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1585
2
  else if (inputs[2]->info.datatype == CCV_32S)
1586
2
    _ccv_nnc_masked_fill_cpu_ref_s((int)(cmd.info.blas.a[0] + 0.5), 0, (ccv_nnc_tensor_view_t*)inputs[0], (ccv_nnc_tensor_view_t*)inputs[2], (ccv_nnc_tensor_view_t*)outputs[0]);
1587
  // TODO: doesn't really support taking gradient on mask.
1588
  // if (output_size >= 2 && outputs[1])
1589
3
  return CCV_NNC_EXEC_SUCCESS;
1590
3
}
1591
1592
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_FORWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1593
1
{
1594
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1595
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
1596
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1597
1
  registry->algorithms = 1;
1598
1
  registry->exec = _ccv_nnc_masked_fill_forw;
1599
1
}
1600
1601
REGISTER_COMMAND_BACKEND(CCV_NNC_MASKED_FILL_BACKWARD, CCV_NNC_BACKEND_CPU_REF)(ccv_nnc_cmd_backend_registry_t* const registry)
1602
1
{
1603
1
  registry->tensor_formats = CCV_TENSOR_FORMAT_NCHW | CCV_TENSOR_FORMAT_NHWC | CCV_TENSOR_FORMAT_CHWN;
1604
1
  registry->tensor_datatypes = CCV_32F | CCV_32S;
1605
1
  registry->tensor_memory = CCV_TENSOR_CPU_MEMORY;
1606
1
  registry->algorithms = 1;
1607
1
  registry->exec = _ccv_nnc_masked_fill_back;
1608
1
}
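
The CCV_16BF conversion branches in _ccv_nnc_datatype_conversion above all report zero hits. A minimal sketch of a test that would exercise the CCV_32F to CCV_16BF path, assuming the usual ccv_nnc helpers from the wider library (ccv_nnc_tensor_new, CPU_TENSOR_NHWC, TENSOR_LIST, CMD_DATATYPE_CONVERSION_FORWARD, ccv_nnc_cmd_exec); those names are not taken from this file, so treat them as assumptions rather than a prescribed test:

#include "ccv.h"
#include "nnc/ccv_nnc.h"
#include "nnc/ccv_nnc_easy.h"

int main(void)
{
  // 32-bit float source tensor with a few known values.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16), 0);
  // bfloat16 destination tensor of the same shape.
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(16BF, 16), 0);
  int i;
  for (i = 0; i < 16; i++)
    a->data.f32[i] = i * 0.5;
  // Expected to take the CCV_32F -> CCV_16BF branch (ccv_float_to_bfloat) in the listing above.
  ccv_nnc_cmd_exec(CMD_DATATYPE_CONVERSION_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  return 0;
}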