Coverage Report

Created: 2025-02-24 17:43

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_cmd.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#ifdef HAVE_CUDA
5
#include "gpu/ccv_nnc_compat.h"
6
#elif defined(HAVE_MPS)
7
#include "mps/ccv_nnc_mps.h"
8
#endif
9
#include <time.h>
10
#include <sys/time.h>
11
12
typedef struct {
13
  const uint32_t cmd;
14
  const char* name;
15
  ccv_nnc_cmd_registry_t registry;
16
  ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
17
} ccv_nnc_cmd_init_t;
18
19
typedef struct {
20
  const uint32_t backend;
21
  const char* name;
22
} ccv_nnc_cmd_backend_init_t;
23
24
// The generated code configures commands and their mappings.
25
#include "cmd/ccv_nnc_cmd.inc"
26
27
void ccv_nnc_init(void)
28
1
{
29
1
  _ccv_nnc_cmd_init();
30
1
}
31
32
static uint64_t _ccv_nnc_flags = 0;
33
34
uint64_t ccv_nnc_flags(void)
35
0
{
36
0
  return _ccv_nnc_flags;
37
0
}
38
39
void ccv_nnc_enable_flag(uint64_t flag)
40
0
{
41
0
  _ccv_nnc_flags |= flag;
42
0
}
43
44
void ccv_nnc_disable_flag(uint64_t flag)
45
0
{
46
0
  _ccv_nnc_flags &= ~flag;
47
0
}
48
49
const char* ccv_nnc_cmd_name(const uint32_t cmd)
50
2.28k
{
51
2.28k
  switch (cmd)
52
2.28k
  {
53
86
    case CCV_NNC_NOOP:
54
86
      return "CCV_NNC_NOOP";
55
3
    case CCV_NNC_CUSTOM_FORWARD:
56
3
      return "CCV_NNC_CUSTOM_FORWARD";
57
0
    case CCV_NNC_CUSTOM_BACKWARD:
58
0
      return "CCV_NNC_CUSTOM_BACKWARD";
59
64
    case CCV_NNC_GRAPH_FORWARD:
60
64
      return "CCV_NNC_GRAPH_FORWARD";
61
5
    case CCV_NNC_GRAPH_BACKWARD:
62
5
      return "CCV_NNC_GRAPH_BACKWARD";
63
2.28k
  }
64
2.12k
  const int idx = _ccv_nnc_cmd_ph(cmd);
65
2.12k
  assert(idx >= 0);
66
2.12k
  assert(idx < sizeof(init_map) / sizeof(init_map[0]));
67
2.12k
  return init_map[idx].name;
68
2.12k
}
69
70
const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
71
0
{
72
0
  if (backend == CCV_NNC_NO_BACKEND)
73
0
    return "CCV_NNC_NO_BACKEND";
74
0
  const int idx = _ccv_nnc_cmd_backend_ph(backend);
75
0
  assert(idx >= 0);
76
0
  assert(idx < CCV_NNC_BACKEND_COUNT);
77
0
  return backend_init_map[idx].name;
78
0
}
79
80
const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
81
82
int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
83
0
{
84
0
  return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
85
0
}
86
87
int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
88
26.8k
{
89
26.8k
  switch (cmd.cmd)
90
26.8k
  {
91
2
    case CCV_NNC_NOOP:
92
2
      return 0;
93
2.40k
    case CCV_NNC_CUSTOM_FORWARD:
94
2.40k
    case CCV_NNC_CUSTOM_BACKWARD:
95
2.40k
    case CCV_NNC_GRAPH_FORWARD:
96
2.40k
    case CCV_NNC_GRAPH_BACKWARD:
97
26.8k
    default:
98
26.8k
      return !(cmd.cmd & 0x1); // If it is even, it is forward
99
26.8k
  }
100
26.8k
}
101
102
int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
103
38.4k
{
104
38.4k
  switch (cmd.cmd)
105
38.4k
  {
106
2
    case CCV_NNC_NOOP:
107
2
      return 0;
108
0
    case CCV_NNC_CUSTOM_FORWARD:
109
4.80k
    case CCV_NNC_CUSTOM_BACKWARD:
110
4.80k
    case CCV_NNC_GRAPH_FORWARD:
111
4.81k
    case CCV_NNC_GRAPH_BACKWARD:
112
38.4k
    default:
113
38.4k
      return !!(cmd.cmd & 0x1); // If it is odd, it is backward
114
38.4k
  }
115
38.4k
}
116
117
int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
118
789
{
119
  // If it is a custom command, a no op, or a graph op, there is no backend to check.
120
789
  if (cmd == CCV_NNC_NOOP ||
121
789
    cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
122
789
    cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
123
0
    return 1;
124
789
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
125
789
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
126
789
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
127
789
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
128
789
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
129
  // Check if the execution function exists or not.
130
789
  return !!api_registry.exec;
131
789
}
132
133
ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags)
134
50.9k
{
135
50.9k
  ccv_nnc_cmd_t cmd;
136
50.9k
  cmd.info = params;
137
50.9k
  cmd.backend = CCV_NNC_NO_BACKEND;
138
50.9k
  assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa));
139
50.9k
  cmd.cmd = _cmd;
140
50.9k
  cmd.algorithm = -1; // This is default.
141
50.9k
  cmd.isa = isa;
142
50.9k
  cmd.data = 0;
143
50.9k
  return cmd;
144
50.9k
}
145
146
const ccv_nnc_hint_t ccv_nnc_no_hint = {};
147
148
int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
149
143k
{
150
143k
  return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
151
143k
}
152
153
int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
154
11
{
155
11
  int i;
156
11
  assert(a.format == b.format);
157
11
  const int nd = ccv_nnc_tensor_nd(a.dim);
158
11
  const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
159
11
  assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
160
11
  assert(nd == size_nd + 1 || nd == size_nd + 2);
161
11
  int hw;
162
11
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
163
11
    (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1))
164
0
    hw = 0;
165
11
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) ||
166
11
       (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1))
167
9
    hw = 1;
168
2
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2)
169
2
    hw = 2;
170
0
  else
171
2
    assert(0 && "unknown format");
172
35
  for (i = 0; i < size_nd; i++)
173
24
  {
174
24
    if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
175
0
      return -1;
176
24
    int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
177
24
    if (expected != b.dim[i + hw])
178
0
      return -1;
179
24
  }
180
11
  return 0;
181
11
}
182
183
ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
184
112k
{
185
112k
  int i;
186
112k
  if (a.format != b.format)
187
0
    return ccv_nnc_no_hint;
188
112k
  assert(a.format == b.format);
189
112k
  const int a_nd = ccv_nnc_tensor_nd(a.dim);
190
112k
  const int b_nd = ccv_nnc_tensor_nd(b.dim);
191
112k
  const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
192
112k
  assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
193
  // Is not auto hint deducible dimensions.
194
112k
  if (a_nd != b_nd || 
(111k
a_nd != size_nd + 1111k
&&
a_nd != size_nd + 2111k
))
195
110k
    return ccv_nnc_no_hint;
196
1.59k
  int hw;
197
1.59k
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
198
1.59k
    (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1))
199
140
    hw = 0;
200
1.45k
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) ||
201
1.45k
       (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1))
202
669
    hw = 1;
203
783
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2)
204
783
    hw = 2;
205
0
  else
206
783
    assert(0 && "unknown format");
207
1.59k
  ccv_nnc_hint_t hint_auto = {};
208
  // 0-dim is reserved for channels
209
4.77k
  for (i = 0; i < size_nd; i++)
210
3.18k
  {
211
    // Cannot have any of the dims be zero; otherwise we cannot auto the hint, return no hint.
212
3.18k
    assert(a.dim[i + hw] && b.dim[i + hw]);
213
    // This is guessed by having a stride that will approximately match the scale.
214
3.18k
    int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
215
3.18k
    hint_auto.stride.dim[i] = stride;
216
3.18k
    int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
217
3.18k
    hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
218
3.18k
    hint_auto.border.end[i] = border - hint_auto.border.begin[i];
219
3.18k
  }
220
1.59k
  return hint_auto;
221
1.59k
}
222
223
void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
224
27.9k
{
225
27.9k
  int i;
226
27.9k
  assert(output_size <= input_size);
227
57.7k
  for (i = 0; i < output_size; i++)
228
29.7k
    outputs[i] = inputs[i];
229
27.9k
}
230
231
void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
232
7.73k
{
233
7.73k
  int i;
234
18.5k
  for (i = 0; i < output_size; i++)
235
10.7k
    outputs[i] = inputs[0];
236
7.73k
}
237
238
void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
239
20.3k
{
240
20.3k
  int i;
241
20.3k
  assert(output_size < input_size);
242
65.8k
  for (i = 0; i < output_size; i++)
243
45.5k
    outputs[i] = inputs[i + 1];
244
20.3k
}
245
246
void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
247
148
{
248
148
  int i;
249
148
  outputs[0] = inputs[0];
250
148
  assert(output_size < input_size);
251
296
  for (i = 1; i < output_size; i++)
252
148
    outputs[i] = inputs[i + 1];
253
148
}
254
255
void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
256
136k
{
257
  // zero out the parameters
258
136k
  const ccv_nnc_tensor_param_t z = {};
259
136k
  int i;
260
340k
  for (i = 0; i < output_size; i++)
261
204k
    outputs[i] = z; // Reset the outputs.
262
  // Cannot handle these situations.
263
136k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
264
3.43k
    return;
265
132k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD)
266
4.42k
  {
267
4.42k
    if (cmd.isa->tensor_auto)
268
4.41k
      cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size);
269
4.42k
    return;
270
4.42k
  }
271
128k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
272
128k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
273
128k
  if (registry.tensor_auto)
274
128k
    registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
275
0
  else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
276
0
    ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
277
0
  else // For backward, the default auto is backward_from_inputs
278
0
    ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
279
128k
}
280
281
int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
282
53.6k
{
283
53.6k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
284
2.54k
    return 0;
285
51.0k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
286
51.0k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
287
51.0k
  if (registry.allow_inplace)
288
19.4k
    return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
289
31.6k
  return 0;
290
51.0k
}
291
292
int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
293
198k
{
294
198k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
295
10.1k
    return 0;
296
188k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
297
188k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
298
188k
  if (registry.enforce_inplace)
299
2.27k
    return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
300
186k
  return 0;
301
188k
}
302
303
// This returns absolute time.
304
uint64_t ccv_nnc_cmd_mono_time(void)
305
3.98k
{
306
3.98k
  struct timespec ts;
307
3.98k
  clock_gettime(CLOCK_MONOTONIC, &ts);
308
3.98k
  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
309
3.98k
}
310
311
uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes)
312
269k
{
313
269k
  if (cmd.cmd == CCV_NNC_NOOP ||
314
269k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
315
269k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
316
7.77k
    return cmd.backend;
317
262k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
318
262k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
319
262k
  assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0);
320
262k
  int i;
321
863k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
322
863k
  {
323
863k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
324
    // We have the exec kernel, and support all the tensor memory types.
325
863k
    if (api_registry.exec &&
326
863k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
327
863k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
328
863k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
329
262k
      return backend_init_map[i].backend;
330
863k
  }
331
0
  return cmd.backend;
332
262k
}
333
334
736
#define AUTO_TUNE_TRIAL_SIZE (3)
335
336
static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
337
414k
{
338
414k
#ifdef HAVE_CUDA
339
414k
  if (!stream_context)
340
114k
  {
341
114k
    int device_id;
342
114k
    if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0)
343
5.92k
      cudevice(device_id);
344
114k
  }
345
414k
#endif
346
414k
}
347
348
ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
349
2.60k
{
350
  // This is a custom cmd kernel, no need to autotune.
351
2.60k
  if (cmd.cmd == CCV_NNC_NOOP ||
352
2.60k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
353
2.60k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
354
1
    return cmd;
355
2.60k
  int i, j, k;
356
  // Go through all the backends that support the input / output tensors' memory types.
357
2.60k
  int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
358
10.9k
  for (i = 0; i < input_size; i++)
359
8.33k
    if (inputs[i])
360
6.12k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
361
7.11k
  for (i = 0; i < output_size; i++)
362
4.50k
    if (outputs[i])
363
4.30k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
364
  // In this case, we cannot determine the type of the tensor, skip auto-tune.
365
2.60k
  if (!tensor_memory)
366
0
    return cmd;
367
  // Otherwise, we are good to go.
368
2.60k
  ccv_nnc_cmd_t tuned_cmd = cmd;
369
2.60k
  int64_t best_measured = -1;
370
2.60k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
371
2.60k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
372
2.60k
  int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend.
373
20.2k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
374
17.8k
  {
375
17.8k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
376
    // We have the exec kernel, and support all the tensor memory types.
377
17.8k
    if (api_registry.exec &&
378
17.8k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
379
17.8k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
380
17.8k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
381
2.79k
    {
382
2.79k
      if (api_registry.autotune)
383
159
        autotune_available_1 = 1;
384
2.79k
      if ((++flag) >= 2) // If we have more than 2 suitable backend, we can do this now.
385
184
        break;
386
2.79k
    }
387
17.8k
  }
388
2.60k
  if (flag == 0)
389
0
    return cmd;
390
2.60k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
391
  // Allocate inputs / outputs and fill them in.
392
2.60k
  ccv_nnc_tensor_t** copy_inputs;
393
2.60k
  ccv_nnc_tensor_t** copy_outputs;
394
2.60k
  ccv_nnc_tensor_t** allocated_inputs;
395
2.60k
  ccv_nnc_tensor_t** allocated_outputs;
396
2.60k
  ccv_nnc_tensor_view_t** allocated_input_views;
397
2.60k
  ccv_nnc_tensor_view_t** allocated_output_views;
398
2.60k
  if (flag > 1 || autotune_available_1)
399
343
  {
400
343
    copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*));
401
343
    copy_outputs = copy_inputs + input_size;
402
343
    allocated_inputs = copy_outputs + output_size;
403
343
    allocated_outputs = allocated_inputs + input_size;
404
343
    allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size);
405
343
    allocated_output_views = allocated_input_views + input_size;
406
343
    int stride[CCV_NNC_MAX_DIM_ALLOC];
407
890
    for (i = 0; i < output_size; i++)
408
547
      if (outputs[i])
409
529
      {
410
2.60k
        for (j = 0; j < input_size; j++)
411
2.07k
          if (inputs[j])
412
1.50k
          {
413
1.50k
            if (outputs[i] == inputs[j])
414
0
            {
415
0
              if (!copy_inputs[j])
416
0
              {
417
0
                allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
418
0
                if (CCV_IS_TENSOR_VIEW(inputs[j]))
419
0
                {
420
0
                  ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
421
0
                  copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
422
0
                } else
423
0
                  copy_inputs[j] = allocated_inputs[j];
424
0
              }
425
0
              copy_outputs[i] = copy_inputs[j];
426
0
              break;
427
1.50k
            } else if (outputs[i]->data.u8 == inputs[j]->data.u8 &&
428
1.50k
              ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
429
0
              if (!copy_inputs[j])
430
0
              {
431
0
                allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
432
0
                if (CCV_IS_TENSOR_VIEW(inputs[j]))
433
0
                {
434
0
                  ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
435
0
                  copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
436
0
                } else
437
0
                  copy_inputs[j] = allocated_inputs[j];
438
0
              }
439
0
              allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0);
440
0
              if (CCV_IS_TENSOR_VIEW(outputs[i]))
441
0
              {
442
0
                  ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
443
0
                copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
444
0
              } else
445
0
                copy_outputs[i] = allocated_outputs[i];
446
0
              break;
447
0
            }
448
1.50k
          }
449
529
        if (!copy_outputs[i])
450
529
        {
451
529
          allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0);
452
529
          if (CCV_IS_TENSOR_VIEW(outputs[i]))
453
3
          {
454
3
            ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
455
3
            copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
456
3
          } else
457
526
            copy_outputs[i] = allocated_outputs[i];
458
529
        }
459
529
      }
460
1.51k
    for (i = 0; i < input_size; i++)
461
1.17k
      if (inputs[i] && !copy_inputs[i])
462
958
        copy_inputs[i] = inputs[i];
463
343
  }
464
2.60k
  if (flag == 1)
465
2.42k
  {
466
8.71k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
467
8.71k
    {
468
8.71k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
469
      // We have the exec kernel, and support all the tensor memory types.
470
8.71k
      if (api_registry.exec &&
471
8.71k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
472
8.71k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
473
8.71k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
474
2.42k
      {
475
2.42k
        tuned_cmd.backend = backend_init_map[i].backend;
476
        // If a given API has an autotune function, use it to pick the top algorithm.
477
2.42k
        if (api_registry.autotune)
478
159
        {
479
159
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
480
159
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
481
159
          tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
482
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
483
159
          ccv_nnc_stream_context_drain(stream_context);
484
159
        }
485
2.42k
        break;
486
2.42k
      }
487
8.71k
    }
488
2.42k
    if (autotune_available_1)
489
159
    {
490
780
      for (i = 0; i < input_size; i++)
491
621
      {
492
621
        if (allocated_inputs[i])
493
0
          ccv_nnc_tensor_free(allocated_inputs[i]);
494
621
        if (allocated_input_views[i])
495
0
          ccv_nnc_tensor_view_free(allocated_input_views[i]);
496
621
      }
497
470
      for (i = 0; i < output_size; i++)
498
311
      {
499
311
        if (allocated_outputs[i])
500
303
          ccv_nnc_tensor_free(allocated_outputs[i]);
501
311
        if (allocated_output_views[i])
502
0
          ccv_nnc_tensor_view_free(allocated_output_views[i]);
503
311
      }
504
159
      ccfree(copy_inputs);
505
159
    }
506
2.42k
    return tuned_cmd;
507
2.42k
  }
508
  // We need a trial loop through all the data.
509
736
  for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
510
552
  {
511
4.41k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
512
3.86k
    {
513
3.86k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
514
      // We have the exec kernel, and support all the tensor memory types.
515
3.86k
      if (api_registry.exec &&
516
3.86k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
517
3.86k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
518
3.86k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
519
1.10k
      {
520
1.10k
        ccv_nnc_cmd_t candid_cmd = cmd;
521
1.10k
        candid_cmd.backend = backend_init_map[i].backend;
522
        // If a given API has an autotune function, use it to pick the top algorithm.
523
1.10k
        if (api_registry.autotune)
524
0
        {
525
          // Assuming k == 0 is sufficient, and we can skip.
526
0
          if (k > 0)
527
0
            continue;
528
0
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
529
0
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
530
0
          candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
531
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
532
0
          ccv_nnc_stream_context_drain(stream_context);
533
0
          uint64_t elapsed = ccv_nnc_cmd_mono_time();
534
          // Ready to run.
535
0
          int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
536
0
          ccv_nnc_stream_context_wait(stream_context);
537
0
          elapsed = ccv_nnc_cmd_mono_time() - elapsed;
538
0
          if (status == CCV_NNC_EXEC_SUCCESS &&
539
0
            (best_measured == -1 || elapsed < best_measured))
540
0
          {
541
0
            best_measured = elapsed;
542
0
            tuned_cmd = candid_cmd;
543
0
          }
544
1.10k
        } else {
545
          // Otherwise loop over the existing algorithms and pick the top one.
546
3.09k
          for (j = 0; j < api_registry.algorithms; j++)
547
1.99k
          {
548
1.99k
            candid_cmd.algorithm = j;
549
1.99k
            ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
550
1.99k
            _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
551
1.99k
            uint64_t elapsed = ccv_nnc_cmd_mono_time();
552
            // Ready to run.
553
1.99k
            int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
554
1.99k
            elapsed = ccv_nnc_cmd_mono_time() - elapsed;
555
1.99k
            if (status == CCV_NNC_EXEC_SUCCESS &&
556
1.99k
              (best_measured == -1 || elapsed < best_measured))
557
647
            {
558
647
              best_measured = elapsed;
559
647
              tuned_cmd = candid_cmd;
560
647
            }
561
1.99k
          }
562
1.10k
        }
563
1.10k
      }
564
3.86k
    }
565
552
  }
566
735
  for (i = 0; i < input_size; i++)
567
551
  {
568
551
    if (allocated_inputs[i])
569
0
      ccv_nnc_tensor_free(allocated_inputs[i]);
570
551
    if (allocated_input_views[i])
571
0
      ccv_nnc_tensor_view_free(allocated_input_views[i]);
572
551
  }
573
420
  for (i = 0; i < output_size; i++)
574
236
  {
575
236
    if (allocated_outputs[i])
576
226
      ccv_nnc_tensor_free(allocated_outputs[i]);
577
236
    if (allocated_output_views[i])
578
3
      ccv_nnc_tensor_view_free(allocated_output_views[i]);
579
236
  }
580
184
  ccfree(copy_inputs);
581
184
  return tuned_cmd;
582
2.60k
}
583
584
int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
585
146k
{
586
  // If it is no-op, return true, it can deal with any number of parameters.
587
146k
  if (cmd.cmd == CCV_NNC_NOOP)
588
112
    return 1;
589
  // If it is a custom command, I cannot check it at all, return false.
590
146k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
591
2.40k
    return 0;
592
144k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
593
144k
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
594
144k
  if (cmd_registry.bitmask)
595
144k
    return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
596
  // If there is no check, none can pass.
597
0
  return 0;
598
144k
}
599
600
int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size)
601
127k
{
602
127k
  int i, j;
603
127k
  int device_id_size = 0;
604
127k
  if (max_device_id_size <= device_id_size)
605
0
    return device_id_size;
606
  // The device id of the exec is determined by its outputs.
607
299k
  for (i = 0; i < output_size; i++)
608
177k
    if (outputs[i] &&
609
177k
      CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
610
177k
      CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
611
24.5k
    {
612
24.5k
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type);
613
24.5k
      int flag = 0;
614
35.3k
      for (j = 0; !flag && j < device_id_size; j++)
615
10.7k
        flag = (device_ids[j] == device_id);
616
24.5k
      if (flag)
617
7.76k
        continue;
618
16.8k
      device_ids[device_id_size++] = device_id;
619
16.8k
      if (device_id_size >= max_device_id_size)
620
5.33k
        return device_id_size;
621
16.8k
    }
622
122k
  if (device_id_size == 0)
623
112k
  {
624
112k
    int device_id = -1;
625
363k
    for (i = 0; i < input_size; i++)
626
251k
      if (inputs[i] &&
627
251k
        CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
628
251k
        CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
629
251k
        (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
630
996
        device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type);
631
112k
    if (device_id >= 0)
632
996
    {
633
996
      device_ids[0] = device_id;
634
996
      return 1;
635
996
    }
636
112k
  }
637
121k
  return device_id_size;
638
122k
}
639
640
void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd)
641
11
{
642
11
  if (cmd.cmd == CCV_NNC_NOOP ||
643
11
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
644
11
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
645
0
    return 0;
646
11
  assert(cmd.backend != CCV_NNC_NO_BACKEND);
647
11
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
648
11
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
649
11
  const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend);
650
11
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
651
11
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
652
11
  return api_registry.aux;
653
11
}
654
655
int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
656
440k
{
657
  // If it is no-op, return as if succeed already.
658
440k
  if (cmd.cmd == CCV_NNC_NOOP)
659
31.2k
    return 0;
660
409k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
661
  // If it is a custom command, just apply it directly.
662
409k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
663
4.82k
  {
664
4.82k
    int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
665
4.82k
    if (!stream_context)
666
4.43k
      ccv_nnc_stream_context_drain(stream_context);
667
4.82k
    return ret;
668
4.82k
  }
669
409k
  assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
670
404k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
671
404k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
672
404k
  int i;
673
404k
  uint32_t backend = cmd.backend;
674
404k
  if (backend == CCV_NNC_NO_BACKEND)
675
180k
  {
676
    // Find a suitable backend.
677
180k
    int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
678
451k
    for (i = 0; i < input_size; i++)
679
271k
      if (inputs[i])
680
269k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
681
408k
    for (i = 0; i < output_size; i++)
682
227k
      if (outputs[i])
683
226k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
684
180k
    backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
685
180k
  }
686
404k
  assert(backend != CCV_NNC_NO_BACKEND);
687
404k
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
688
404k
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
689
404k
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
690
404k
  if (!api_registry.exec)
691
0
    return CCV_NNC_EXEC_NO_KERNEL;
692
  // Everything is out, call the underlying implementation.
693
404k
  int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
694
404k
  if (!stream_context)
695
105k
    ccv_nnc_stream_context_drain(stream_context);
696
404k
  return ret;
697
404k
}
698
699
int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
700
0
{
701
  // No additional attr for noop.
702
0
  if (cmd.cmd == CCV_NNC_NOOP ||
703
    // If it is a custom command, just apply it directly.
704
0
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
705
    // If it is sub-graph, there is no additional attr as well.
706
0
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
707
0
    return 0;
708
0
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
709
0
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
710
0
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
711
0
  return !!(cmd_registry.flags & flags);
712
0
}
713
714
void ccv_nnc_set_profiler(int state)
715
0
{
716
0
#ifdef HAVE_CUDA
717
0
  cusetprofiler(state);
718
0
#endif
719
0
}
720
721
int ccv_nnc_queue_watermark(void)
722
0
{
723
#ifdef HAVE_MPS
724
  return ccv_nnc_mps_queue_watermark();
725
#else
726
0
  return 0;
727
0
#endif
728
0
}
729
730
void ccv_nnc_set_queue_watermark(int watermark)
731
0
{
732
#ifdef HAVE_MPS
733
  // If we need to be memory efficient, we need to bound how many in-flight command buffers there are.
734
  ccv_nnc_mps_set_queue_watermark(watermark);
735
#endif
736
0
}
737
738
void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size)
739
2
{
740
2
  if (type != CCV_STREAM_CONTEXT_GPU)
741
0
    return;
742
2
#ifdef HAVE_CUDA
743
2
  cusetdevicemap(device_map, size);
744
2
#endif
745
2
}