Coverage Report

Created: 2025-05-07 17:36

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_cmd.c
Line  Count  Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "3rdparty/khash/khash.h"
4
#include "ccv_nnc_easy.h"
5
#ifdef HAVE_CUDA
6
#include "gpu/ccv_nnc_compat.h"
7
#elif defined(HAVE_MPS)
8
#include "mps/ccv_nnc_mps.h"
9
#endif
10
#include <time.h>
11
#include <sys/time.h>
12
13
typedef struct {
14
  const uint32_t cmd;
15
  const char* name;
16
  ccv_nnc_cmd_registry_t registry;
17
  ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
18
} ccv_nnc_cmd_init_t;
19
20
typedef struct {
21
  const uint32_t backend;
22
  const char* name;
23
} ccv_nnc_cmd_backend_init_t;
24
25
// The generated code configures the commands and their mappings.
26
#include "cmd/ccv_nnc_cmd.inc"
27
28
void ccv_nnc_init(void)
29
1
{
30
1
  _ccv_nnc_cmd_init();
31
1
}
32
33
static uint64_t _ccv_nnc_flags = 0;
34
35
uint64_t ccv_nnc_flags(void)
36
0
{
37
0
  return _ccv_nnc_flags;
38
0
}
39
40
void ccv_nnc_enable_flag(uint64_t flag)
41
0
{
42
0
  _ccv_nnc_flags |= flag;
43
0
}
44
45
void ccv_nnc_disable_flag(uint64_t flag)
46
0
{
47
0
  _ccv_nnc_flags &= ~flag;
48
0
}
49
50
const char* ccv_nnc_cmd_name(const uint32_t cmd)
51
2.29k
{
52
2.29k
  switch (cmd)
53
2.29k
  {
54
86
    case CCV_NNC_NOOP:
55
86
      return "CCV_NNC_NOOP";
56
3
    case CCV_NNC_CUSTOM_FORWARD:
57
3
      return "CCV_NNC_CUSTOM_FORWARD";
58
0
    case CCV_NNC_CUSTOM_BACKWARD:
59
0
      return "CCV_NNC_CUSTOM_BACKWARD";
60
64
    case CCV_NNC_GRAPH_FORWARD:
61
64
      return "CCV_NNC_GRAPH_FORWARD";
62
5
    case CCV_NNC_GRAPH_BACKWARD:
63
5
      return "CCV_NNC_GRAPH_BACKWARD";
64
2.29k
  }
65
2.14k
  const int idx = _ccv_nnc_cmd_ph(cmd);
66
2.14k
  assert(idx >= 0);
67
2.14k
  assert(idx < sizeof(init_map) / sizeof(init_map[0]));
68
2.14k
  return init_map[idx].name;
69
2.14k
}
70
71
const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
72
0
{
73
0
  if (backend == CCV_NNC_NO_BACKEND)
74
0
    return "CCV_NNC_NO_BACKEND";
75
0
  const int idx = _ccv_nnc_cmd_backend_ph(backend);
76
0
  assert(idx >= 0);
77
0
  assert(idx < CCV_NNC_BACKEND_COUNT);
78
0
  return backend_init_map[idx].name;
79
0
}
80
81
const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
82
83
int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
84
0
{
85
0
  return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
86
0
}
87
88
int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
89
26.9k
{
90
26.9k
  switch (cmd.cmd)
91
26.9k
  {
92
2
    case CCV_NNC_NOOP:
93
2
      return 0;
94
2.40k
    case CCV_NNC_CUSTOM_FORWARD:
95
2.40k
    case CCV_NNC_CUSTOM_BACKWARD:
96
2.40k
    case CCV_NNC_GRAPH_FORWARD:
97
2.40k
    case CCV_NNC_GRAPH_BACKWARD:
98
26.9k
    default:
99
26.9k
      return !(cmd.cmd & 0x1); // If it is even, it is forward
100
26.9k
  }
101
26.9k
}
102
103
int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
104
38.4k
{
105
38.4k
  switch (cmd.cmd)
106
38.4k
  {
107
2
    case CCV_NNC_NOOP:
108
2
      return 0;
109
0
    case CCV_NNC_CUSTOM_FORWARD:
110
4.80k
    case CCV_NNC_CUSTOM_BACKWARD:
111
4.80k
    case CCV_NNC_GRAPH_FORWARD:
112
4.81k
    case CCV_NNC_GRAPH_BACKWARD:
113
38.4k
    default:
114
38.4k
      return !!(cmd.cmd & 0x1); // If it is odd, it is backward
115
38.4k
  }
116
38.4k
}
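The two predicates above rely on the command identifier encoding noted in the comments: forward opcodes are even, their backward counterparts odd. A minimal standalone sketch of that parity check; the FAKE_CMD_* values are made-up placeholders for illustration, not real ccv_nnc opcodes.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical opcodes for illustration only: forward is even, backward is odd. */
enum { FAKE_CMD_GEMM_FORWARD = 0x10, FAKE_CMD_GEMM_BACKWARD = 0x11 };

static int is_forward(uint32_t cmd)  { return !(cmd & 0x1); }  /* even -> forward */
static int is_backward(uint32_t cmd) { return !!(cmd & 0x1); } /* odd -> backward */

int main(void)
{
    printf("%d %d\n", is_forward(FAKE_CMD_GEMM_FORWARD), is_backward(FAKE_CMD_GEMM_BACKWARD)); /* prints: 1 1 */
    return 0;
}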
117
118
int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
119
846
{
120
  // If it is a custom command, a no op, or a graph op, there is no backend to check.
121
846
  if (cmd == CCV_NNC_NOOP ||
122
846
    cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
123
846
    cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
124
0
    return 1;
125
846
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
126
846
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
127
846
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
128
846
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
129
846
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
130
  // Check if the execution function exists or not.
131
846
  return !!api_registry.exec;
132
846
}
133
134
ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags)
135
50.8k
{
136
50.8k
  ccv_nnc_cmd_t cmd;
137
50.8k
  cmd.info = params;
138
50.8k
  cmd.backend = CCV_NNC_NO_BACKEND;
139
50.8k
  assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa));
140
50.8k
  cmd.cmd = _cmd;
141
50.8k
  cmd.algorithm = -1; // This is default.
142
50.8k
  cmd.isa = isa;
143
50.8k
  cmd.data = 0;
144
50.8k
  return cmd;
145
50.8k
}
146
147
const ccv_nnc_hint_t ccv_nnc_no_hint = {};
148
149
int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
150
143k
{
151
143k
  return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
152
143k
}
153
154
int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
155
11
{
156
11
  int i;
157
11
  assert(a.format == b.format);
158
11
  const int nd = ccv_nnc_tensor_nd(a.dim);
159
11
  const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
160
11
  assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
161
11
  assert(nd == size_nd + 1 || nd == size_nd + 2);
162
11
  int hw;
163
11
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
164
11
    (a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 1))
165
0
    hw = 0;
166
11
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == size_nd + 2) ||
167
11
       (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 1))
168
9
    hw = 1;
169
2
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == size_nd + 2)
170
2
    hw = 2;
171
0
  else
172
2
    assert(0 && "unknown format");
173
35
  for (i = 0; i < size_nd; i++)
174
24
  {
175
24
    if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
176
0
      return -1;
177
24
    int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
178
24
    if (expected != b.dim[i + hw])
179
0
      return -1;
180
24
  }
181
11
  return 0;
182
11
}
183
184
ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
185
112k
{
186
112k
  int i;
187
112k
  if (a.format != b.format)
188
0
    return ccv_nnc_no_hint;
189
112k
  assert(a.format == b.format);
190
112k
  const int a_nd = ccv_nnc_tensor_nd(a.dim);
191
112k
  const int b_nd = ccv_nnc_tensor_nd(b.dim);
192
112k
  const int size_nd = ccv_max(2, ccv_nnc_tensor_nd(cmd.size.dim) - 1);
193
112k
  assert(size_nd == 2 || size_nd == 3); // Support 3D convolution.
194
  // These are not dimensions from which the auto hint is deducible.
195
112k
  if (a_nd != b_nd || (a_nd != size_nd + 1 && a_nd != size_nd + 2))
196
110k
    return ccv_nnc_no_hint;
197
1.59k
  int hw;
198
1.59k
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
199
1.59k
    (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 1))
200
140
    hw = 0;
201
1.45k
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == size_nd + 2) ||
202
1.45k
       (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 1))
203
669
    hw = 1;
204
783
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == size_nd + 2)
205
783
    hw = 2;
206
0
  else
207
783
    assert(0 && "unknown format");
208
1.59k
  ccv_nnc_hint_t hint_auto = {};
209
  // 0-dim is reserved for channels
210
4.77k
  for (i = 0; i < size_nd; i++)
211
3.18k
  {
212
    // Cannot have any of the dims be zero, otherwise we cannot auto the hint.
213
3.18k
    assert(a.dim[i + hw] && b.dim[i + hw]);
214
    // This is guessed by having a stride that will approximately match the scale.
215
3.18k
    int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
216
3.18k
    hint_auto.stride.dim[i] = stride;
217
3.18k
    int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
218
3.18k
    hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
219
3.18k
    hint_auto.border.end[i] = border - hint_auto.border.begin[i];
220
3.18k
  }
221
1.59k
  return hint_auto;
222
1.59k
}
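The auto-hint above guesses the stride as a rounded ratio of input to output extent and then back-solves the total padding so that (in + pad - kernel) / stride + 1 equals the output extent, splitting the padding front-heavy to match CUDNN. A small sketch of that arithmetic for a single dimension; the concrete sizes below are illustrative, not taken from the report.

#include <stdio.h>

int main(void)
{
    /* Illustrative sizes: input extent 224, output extent 112, kernel size 3. */
    const int a_dim = 224, b_dim = 112, k = 3;
    const int stride = (a_dim + b_dim / 2) / b_dim;      /* rounded ratio -> 2 */
    const int border = (b_dim - 1) * stride - a_dim + k; /* total padding -> 1 */
    const int begin = (border + 1) / 2;                  /* front-heavy split */
    const int end = border - begin;
    printf("stride=%d begin=%d end=%d\n", stride, begin, end); /* stride=2 begin=1 end=0 */
    return 0;
}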
223
224
void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
225
28.0k
{
226
28.0k
  int i;
227
28.0k
  assert(output_size <= input_size);
228
57.8k
  for (i = 0; i < output_size; i++)
229
29.8k
    outputs[i] = inputs[i];
230
28.0k
}
231
232
void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
233
7.74k
{
234
7.74k
  int i;
235
18.5k
  for (i = 0; i < output_size; i++)
236
10.8k
    outputs[i] = inputs[0];
237
7.74k
}
238
239
void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
240
20.3k
{
241
20.3k
  int i;
242
20.3k
  assert(output_size < input_size);
243
65.8k
  for (i = 0; i < output_size; i++)
244
45.5k
    outputs[i] = inputs[i + 1];
245
20.3k
}
246
247
void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
248
148
{
249
148
  int i;
250
148
  outputs[0] = inputs[0];
251
148
  assert(output_size < input_size);
252
296
  for (i = 1; i < output_size; i++)
253
148
    outputs[i] = inputs[i + 1];
254
148
}
255
256
void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
257
136k
{
258
  // zero out the parameters
259
136k
  const ccv_nnc_tensor_param_t z = {};
260
136k
  int i;
261
341k
  for (i = 0; i < output_size; 
i++204k
)
262
204k
    outputs[i] = z; // Reset the outputs.
263
  // Cannot handle these situations.
264
136k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
265
3.43k
    return;
266
132k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD)
267
4.42k
  {
268
4.42k
    if (cmd.isa->tensor_auto)
269
4.41k
      cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size);
270
4.42k
    return;
271
4.42k
  }
272
128k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
273
128k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
274
128k
  if (registry.tensor_auto)
275
128k
    registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
276
0
  else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
277
0
    ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
278
0
  else // For backward, the default auto is backward_from_inputs
279
0
    ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
280
128k
}
281
282
int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
283
53.7k
{
284
53.7k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
285
2.54k
    return 0;
286
51.1k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
287
51.1k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
288
51.1k
  if (registry.allow_inplace)
289
19.4k
    return registry.allow_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
290
31.7k
  return 0;
291
51.1k
}
292
293
int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
294
198k
{
295
198k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
296
10.1k
    return 0;
297
188k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
298
188k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
299
188k
  if (registry.enforce_inplace)
300
2.27k
    return registry.enforce_inplace(cmd.info, input_idx, input_size, output_idx, output_size);
301
186k
  return 0;
302
188k
}
303
304
// This returns absolute time.
305
uint64_t ccv_nnc_cmd_mono_time(void)
306
3.29k
{
307
3.29k
  struct timespec ts;
308
3.29k
  clock_gettime(CLOCK_MONOTONIC, &ts);
309
3.29k
  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
310
3.29k
}
311
312
uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes)
313
269k
{
314
269k
  if (cmd.cmd == CCV_NNC_NOOP ||
315
269k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
316
269k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
317
7.77k
    return cmd.backend;
318
261k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
319
261k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
320
261k
  assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0);
321
261k
  int i;
322
862k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
323
862k
  {
324
862k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
325
    // We have the exec kernel, and support all the tensor memory types.
326
862k
    if (api_registry.exec &&
327
862k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
328
862k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
329
862k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
330
261k
      return backend_init_map[i].backend;
331
862k
  }
332
0
  return cmd.backend;
333
261k
}
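Backend selection above is a subset test on capability bitmasks: a backend qualifies only when every requested memory, format, and datatype bit is present in what it advertises. A reduced sketch of that test; the flag names and values below are made up for illustration, the real CCV_TENSOR_* flags are defined elsewhere in the library.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical capability bits, for illustration only. */
#define MEM_CPU  0x1
#define MEM_GPU  0x2
#define FMT_NHWC 0x1
#define FMT_NCHW 0x2

static int backend_ok(uint32_t supported_mem, uint32_t supported_fmt, uint32_t required_mem, uint32_t required_fmt)
{
    /* Subset test: every required bit must be present in the supported mask. */
    return (supported_mem & required_mem) == required_mem &&
        (supported_fmt & required_fmt) == required_fmt;
}

int main(void)
{
    printf("%d\n", backend_ok(MEM_CPU | MEM_GPU, FMT_NHWC, MEM_GPU, FMT_NHWC)); /* 1 */
    printf("%d\n", backend_ok(MEM_CPU, FMT_NHWC | FMT_NCHW, MEM_GPU, FMT_NHWC)); /* 0 */
    return 0;
}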
334
335
604
#define AUTO_TUNE_TRIAL_SIZE (3)
336
337
static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
338
411k
{
339
411k
#ifdef HAVE_CUDA
340
411k
  if (!stream_context)
341
112k
  {
342
112k
    int device_id;
343
112k
    if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0)
344
4.21k
      cudevice(device_id);
345
112k
  }
346
411k
#endif
347
411k
}
348
349
typedef struct {
350
  int format;
351
  int datatype;
352
  int nd;
353
  off_t dataof;
354
  int dim[CCV_NNC_MAX_DIM_ALLOC];
355
  int stride[CCV_NNC_MAX_DIM_ALLOC];
356
} ccv_nnc_cmd_autotune_tensor_shape_t;
357
358
typedef struct {
359
  uint32_t cmd;
360
  ccv_nnc_cmd_param_t params;
361
  ccv_nnc_hint_t hint;
362
  int flags;
363
  int input_size;
364
  int output_size;
365
  size_t workspace_size;
366
  ccv_nnc_cmd_autotune_tensor_shape_t* inputs;
367
  ccv_nnc_cmd_autotune_tensor_shape_t* outputs;
368
} ccv_nnc_cmd_autotune_key_t;
369
370
static CCV_WARN_UNUSED(ccv_nnc_cmd_autotune_key_t) ccv_nnc_cmd_autotune_key_new(const ccv_nnc_cmd_t cmd, const size_t workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
371
2.65k
{
372
2.65k
  ccv_nnc_cmd_autotune_key_t key = {
373
2.65k
    .cmd = cmd.cmd,
374
2.65k
    .params = cmd.info,
375
2.65k
    .hint = hint,
376
2.65k
    .workspace_size = workspace_size,
377
2.65k
    .inputs = 0,
378
2.65k
    .input_size = 0,
379
2.65k
    .outputs = 0,
380
2.65k
    .output_size = 0
381
2.65k
  };
382
2.65k
  if (input_size == 0 && output_size == 0)
383
0
    return key;
384
2.65k
  assert(input_size >= 0 && output_size >= 0);
385
2.65k
  key.input_size = input_size;
386
2.65k
  key.output_size = output_size;
387
2.65k
  key.inputs = (ccv_nnc_cmd_autotune_tensor_shape_t*)ccmalloc(sizeof(ccv_nnc_cmd_autotune_tensor_shape_t) * (input_size + output_size));
388
2.65k
  key.outputs = key.inputs + input_size;
389
2.65k
  int i, j;
390
11.1k
  for (i = 0; i < input_size; i++)
391
8.45k
  {
392
8.45k
    memset(key.inputs[i].dim, 0, sizeof(key.inputs[i].dim));
393
8.45k
    memset(key.inputs[i].stride, 0, sizeof(key.inputs[i].stride));
394
8.45k
    if (!inputs[i])
395
2.24k
    {
396
2.24k
      key.inputs[i].format = 0;
397
2.24k
      key.inputs[i].datatype = 0;
398
2.24k
      key.inputs[i].dataof = 0;
399
2.24k
      key.inputs[i].nd = 0;
400
2.24k
      continue;
401
2.24k
    }
402
6.21k
    key.inputs[i].format = inputs[i]->info.format;
403
6.21k
    key.inputs[i].datatype = inputs[i]->info.datatype;
404
6.21k
    key.inputs[i].dataof = inputs[i]->dataof;
405
6.21k
    const int nd = key.inputs[i].nd = ccv_nnc_tensor_nd(inputs[i]->info.dim);
406
20.4k
    for (j = 0; j < nd; j++)
407
14.1k
      key.inputs[i].dim[j] = inputs[i]->info.dim[j];
408
6.21k
    if (CCV_IS_TENSOR_VIEW(inputs[i]))
409
18
      for (j = 0; j < nd; j++)
410
12
        key.inputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)inputs[i])->stride[j];
411
6.21k
  }
412
7.22k
  for (i = 0; i < output_size; i++)
413
4.57k
  {
414
4.57k
    memset(key.outputs[i].dim, 0, sizeof(key.outputs[i].dim));
415
4.57k
    memset(key.outputs[i].stride, 0, sizeof(key.outputs[i].stride));
416
4.57k
    if (!outputs[i])
417
205
    {
418
205
      key.outputs[i].format = 0;
419
205
      key.outputs[i].datatype = 0;
420
205
      key.outputs[i].dataof = 0;
421
205
      key.outputs[i].nd = 0;
422
205
      continue;
423
205
    }
424
4.36k
    key.outputs[i].format = outputs[i]->info.format;
425
4.36k
    key.outputs[i].datatype = outputs[i]->info.datatype;
426
4.36k
    key.outputs[i].dataof = outputs[i]->dataof;
427
4.36k
    const int nd = key.outputs[i].nd = ccv_nnc_tensor_nd(outputs[i]->info.dim);
428
14.1k
    for (j = 0; j < nd; j++)
429
9.78k
      key.outputs[i].dim[j] = outputs[i]->info.dim[j];
430
4.36k
    if (CCV_IS_TENSOR_VIEW(outputs[i]))
431
16
      for (j = 0; j < nd; j++)
432
9
        key.outputs[i].stride[j] = ((ccv_nnc_tensor_view_t*)outputs[i])->stride[j];
433
4.36k
  }
434
2.65k
  return key;
435
2.65k
}
436
437
// autotune cache.
438
static inline uint32_t twang_32from64(uint64_t key)
439
301k
{
440
301k
  key = (~key) + (key << 18);
441
301k
  key = key ^ (key >> 31);
442
301k
  key = key * 21;
443
301k
  key = key ^ (key >> 11);
444
301k
  key = key + (key << 6);
445
301k
  key = key ^ (key >> 22);
446
301k
  return (uint32_t)(key);
447
301k
}
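The mixer above appears to be a Thomas Wang-style 64-to-32-bit integer hash; the autotune key hash below repeatedly folds the running 32-bit state together with the next 32-bit field through it. A self-contained sketch of that folding pattern, reusing the same mixer over a small made-up word array:

#include <stdio.h>
#include <stdint.h>

static inline uint32_t twang_32from64(uint64_t key)
{
    key = (~key) + (key << 18);
    key = key ^ (key >> 31);
    key = key * 21;
    key = key ^ (key >> 11);
    key = key + (key << 6);
    key = key ^ (key >> 22);
    return (uint32_t)key;
}

int main(void)
{
    /* Fold a small array of 32-bit words into one 32-bit hash, as the key hash does. */
    const uint32_t words[] = { 3, 1, 4, 1, 5, 9 };
    uint32_t h = 0;
    for (int i = 0; i < (int)(sizeof(words) / sizeof(words[0])); i++)
        h = twang_32from64(((uint64_t)h << 32) | words[i]);
    printf("0x%08x\n", h);
    return 0;
}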
448
449
static inline khint32_t _kh_autotune_key_executable_hash_func(const ccv_nnc_cmd_autotune_key_t key)
450
2.81k
{
451
2.81k
  uint32_t h = key.cmd;
452
2.81k
  int i, j;
453
2.81k
  uint32_t* data = (uint32_t*)&key.params;
454
87.2k
  for (i = 0; i < sizeof(key.params) / sizeof(uint32_t); i++)
455
84.4k
    h = twang_32from64(((uint64_t)h << 32) | data[i]);
456
2.81k
  data = (uint32_t*)&key.hint;
457
104k
  for (i = 0; i < sizeof(key.hint) / sizeof(uint32_t); i++)
458
101k
    h = twang_32from64(((uint64_t)h << 32) | data[i]);
459
2.81k
  h = twang_32from64(((uint64_t)h << 32) | key.workspace_size);
460
2.81k
  h = twang_32from64(((uint64_t)h << 32) | key.input_size);
461
2.81k
  h = twang_32from64(((uint64_t)h << 32) | key.output_size);
462
11.7k
  for (i = 0; i < key.input_size; i++)
463
8.94k
  {
464
8.94k
    h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].format);
465
8.94k
    h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].datatype);
466
8.94k
    h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dataof);
467
8.94k
    h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].nd);
468
24.1k
    for (j = 0; j < key.inputs[i].nd; j++)
469
15.2k
    {
470
15.2k
      h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].dim[j]);
471
15.2k
      h = twang_32from64(((uint64_t)h << 32) | key.inputs[i].stride[j]);
472
15.2k
    }
473
8.94k
  }
474
7.71k
  for (i = 0; i < key.output_size; i++)
475
4.89k
  {
476
4.89k
    h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].format);
477
4.89k
    h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].datatype);
478
4.89k
    h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dataof);
479
4.89k
    h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].nd);
480
15.4k
    for (j = 0; j < key.outputs[i].nd; j++)
481
10.5k
    {
482
10.5k
      h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].dim[j]);
483
10.5k
      h = twang_32from64(((uint64_t)h << 32) | key.outputs[i].stride[j]);
484
10.5k
    }
485
4.89k
  }
486
2.81k
  return (khint32_t)h;
487
2.81k
}
488
489
static inline int _kh_autotune_key_executable_hash_equal(const ccv_nnc_cmd_autotune_key_t a, const ccv_nnc_cmd_autotune_key_t b)
490
3.17k
{
491
3.17k
  if (a.cmd != b.cmd || a.flags != b.flags || a.workspace_size != b.workspace_size || a.input_size != b.input_size || a.output_size != b.output_size)
492
1.32k
    return 0;
493
1.84k
  if (memcmp(&a.params, &b.params, sizeof(a.params)) != 0)
494
57
    return 0;
495
1.79k
  if (memcmp(&a.hint, &b.hint, sizeof(a.hint)) != 0)
496
11
    return 0;
497
1.78k
  int i, j;
498
7.61k
  for (i = 0; i < a.input_size; i++)
499
5.87k
  {
500
5.87k
    if (a.inputs[i].format != b.inputs[i].format || a.inputs[i].datatype != b.inputs[i].datatype || a.inputs[i].nd != b.inputs[i].nd || a.inputs[i].dataof != b.inputs[i].dataof)
501
0
      return 0;
502
15.5k
    for (j = 0; j < a.inputs[i].nd; j++)
503
9.76k
      if (a.inputs[i].dim[j] != b.inputs[i].dim[j] || a.inputs[i].stride[j] != b.inputs[i].stride[j])
504
35
        return 0;
505
5.87k
  }
506
4.87k
  for (i = 0; i < a.output_size; i++)
507
3.12k
  {
508
3.12k
    if (a.outputs[i].format != b.outputs[i].format || a.outputs[i].datatype != b.outputs[i].datatype || a.outputs[i].nd != b.outputs[i].nd || a.outputs[i].dataof != b.outputs[i].dataof)
509
0
      return 0;
510
9.90k
    for (j = 0; j < a.outputs[i].nd; j++)
511
6.77k
      if (a.outputs[i].dim[j] != b.outputs[i].dim[j] || a.outputs[i].stride[j] != b.outputs[i].stride[j])
512
0
        return 0;
513
3.12k
  }
514
1.74k
  return 1;
515
1.74k
}
516
517
typedef struct {
518
  int backend;
519
  int algorithm;
520
} ccv_nnc_cmd_autotune_val_t;
521
522
KHASH_INIT(autotune_executable_cache, ccv_nnc_cmd_autotune_key_t, ccv_nnc_cmd_autotune_val_t, 1, _kh_autotune_key_executable_hash_func, _kh_autotune_key_executable_hash_equal)
523
524
static khash_t(autotune_executable_cache)* g_autotune_executable_cache = 0;
525
526
static inline void ccv_nnc_cmd_autotune_key_free(ccv_nnc_cmd_autotune_key_t key)
527
2.65k
{
528
2.65k
  if (key.inputs)
529
2.65k
    ccfree(key.inputs);
530
2.65k
}
531
532
void ccv_nnc_drain_autotune_cache(void)
533
262
{
534
262
  if (!g_autotune_executable_cache)
535
1
    return;
536
261
  khiter_t k;
537
33.6k
  for (k = kh_begin(g_autotune_executable_cache); k < kh_end(g_autotune_executable_cache); k++)
538
33.4k
  {
539
33.4k
    if (!kh_exist(g_autotune_executable_cache, k))
540
32.5k
      continue;
541
907
    ccv_nnc_cmd_autotune_key_free(kh_key(g_autotune_executable_cache, k));
542
907
    kh_del(autotune_executable_cache, g_autotune_executable_cache, k);
543
907
  }
544
261
}
545
546
ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
547
2.65k
{
548
  // This is a custom cmd kernel, no need to autotune.
549
2.65k
  if (cmd.cmd == CCV_NNC_NOOP ||
550
2.65k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
551
2.65k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
552
1
    return cmd;
553
2.65k
  int i, j, k;
554
  // Go through all the backends that support the memory type of the input / output tensors.
555
2.65k
  int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
556
11.1k
  for (i = 0; i < input_size; i++)
557
8.45k
    if (inputs[i])
558
6.21k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
559
7.22k
  for (i = 0; i < output_size; i++)
560
4.57k
    if (outputs[i])
561
4.36k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
562
  // In this case, we cannot determine the type of the tensor, skip auto-tune.
563
2.65k
  if (!tensor_memory)
564
0
    return cmd;
565
  // Otherwise, we are good to go.
566
2.65k
  ccv_nnc_cmd_t tuned_cmd = cmd;
567
2.65k
  if (!g_autotune_executable_cache)
568
1
    g_autotune_executable_cache = kh_init(autotune_executable_cache);
569
2.65k
  int ret = 0;
570
2.65k
  ccv_nnc_cmd_autotune_key_t key = ccv_nnc_cmd_autotune_key_new(cmd, max_workspace_size, hint, flags, inputs, input_size, outputs, output_size);
571
2.65k
  khiter_t kiter = kh_put(autotune_executable_cache, g_autotune_executable_cache, key, &ret);
572
2.65k
  if (ret == 0)
573
1.74k
  {
574
1.74k
    ccv_nnc_cmd_autotune_key_free(key);
575
1.74k
    const ccv_nnc_cmd_autotune_val_t val = kh_val(g_autotune_executable_cache, kiter);
576
1.74k
    tuned_cmd.backend = val.backend;
577
1.74k
    tuned_cmd.algorithm = val.algorithm;
578
1.74k
    return tuned_cmd;
579
1.74k
  }
580
907
  int64_t best_measured = -1;
581
907
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
582
907
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
583
907
  int flag = 0, autotune_available_1 = 0; // This is only applicable if we have only one backend.
584
6.80k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
585
6.04k
  {
586
6.04k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
587
    // We have the exec kernel, and support all the tensor memory types.
588
6.04k
    if (api_registry.exec &&
589
6.04k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
590
6.04k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
591
6.04k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
592
1.05k
    {
593
1.05k
      if (api_registry.autotune)
594
51
        autotune_available_1 = 1;
595
1.05k
      if ((++flag) >= 2) // If we have more than 2 suitable backend, we can do this now.
596
151
        break;
597
1.05k
    }
598
6.04k
  }
599
907
  if (flag == 0)
600
0
    return cmd;
601
907
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
602
  // Allocate inputs / outputs and fill them in.
603
907
  ccv_nnc_tensor_t** copy_inputs;
604
907
  ccv_nnc_tensor_t** copy_outputs;
605
907
  ccv_nnc_tensor_t** allocated_inputs;
606
907
  ccv_nnc_tensor_t** allocated_outputs;
607
907
  ccv_nnc_tensor_view_t** allocated_input_views;
608
907
  ccv_nnc_tensor_view_t** allocated_output_views;
609
907
  if (flag > 1 || autotune_available_1)
610
202
  {
611
202
    copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*));
612
202
    copy_outputs = copy_inputs + input_size;
613
202
    allocated_inputs = copy_outputs + output_size;
614
202
    allocated_outputs = allocated_inputs + input_size;
615
202
    allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size);
616
202
    allocated_output_views = allocated_input_views + input_size;
617
202
    int stride[CCV_NNC_MAX_DIM_ALLOC];
618
490
    for (i = 0; i < output_size; i++)
619
288
      if (outputs[i])
620
279
      {
621
1.28k
        for (j = 0; j < input_size; j++)
622
1.00k
          if (inputs[j])
623
785
          {
624
785
            if (outputs[i] == inputs[j])
625
0
            {
626
0
              if (!copy_inputs[j])
627
0
              {
628
0
                allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
629
0
                if (CCV_IS_TENSOR_VIEW(inputs[j]))
630
0
                {
631
0
                  ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
632
0
                  copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
633
0
                } else
634
0
                  copy_inputs[j] = allocated_inputs[j];
635
0
              }
636
0
              copy_outputs[i] = copy_inputs[j];
637
0
              break;
638
785
            } else if (outputs[i]->data.u8 == inputs[j]->data.u8 &&
639
785
              ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
640
0
              if (!copy_inputs[j])
641
0
              {
642
0
                allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
643
0
                if (CCV_IS_TENSOR_VIEW(inputs[j]))
644
0
                {
645
0
                  ccv_nnc_tensor_get_stride(inputs[j]->info.dim, stride);
646
0
                  copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), stride));
647
0
                } else
648
0
                  copy_inputs[j] = allocated_inputs[j];
649
0
              }
650
0
              allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0);
651
0
              if (CCV_IS_TENSOR_VIEW(outputs[i]))
652
0
              {
653
0
                  ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
654
0
                copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
655
0
              } else
656
0
                copy_outputs[i] = allocated_outputs[i];
657
0
              break;
658
0
            }
659
785
          }
660
279
        if (!copy_outputs[i])
661
279
        {
662
279
          allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0);
663
279
          if (CCV_IS_TENSOR_VIEW(outputs[i]))
664
3
          {
665
3
            ccv_nnc_tensor_get_stride(outputs[i]->info.dim, stride);
666
3
            copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), stride));
667
3
          } else
668
276
            copy_outputs[i] = allocated_outputs[i];
669
279
        }
670
279
      }
671
848
    for (i = 0; i < input_size; i++)
672
646
      if (inputs[i] && !copy_inputs[i])
673
559
        copy_inputs[i] = inputs[i];
674
202
  }
675
907
  if (flag == 1)
676
756
  {
677
2.53k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
678
2.53k
    {
679
2.53k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
680
      // We have the exec kernel, and support all the tensor memory types.
681
2.53k
      if (api_registry.exec &&
682
2.53k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
683
2.53k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
684
2.53k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
685
756
      {
686
756
        tuned_cmd.backend = backend_init_map[i].backend;
687
        // If a given API exist an autotune function, use that to pick the top algorithm.
688
756
        if (api_registry.autotune)
689
51
        {
690
51
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
691
51
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
692
51
          tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
693
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
694
51
          ccv_nnc_stream_context_drain(stream_context);
695
51
        }
696
756
        break;
697
756
      }
698
2.53k
    }
699
756
    if (autotune_available_1)
700
51
    {
701
240
      for (i = 0; i < input_size; i++)
702
189
      {
703
189
        if (allocated_inputs[i])
704
0
          ccv_nnc_tensor_free(allocated_inputs[i]);
705
189
        if (allocated_input_views[i])
706
0
          ccv_nnc_tensor_view_free(allocated_input_views[i]);
707
189
      }
708
146
      for (i = 0; i < output_size; i++)
709
95
      {
710
95
        if (allocated_outputs[i])
711
93
          ccv_nnc_tensor_free(allocated_outputs[i]);
712
95
        if (allocated_output_views[i])
713
0
          ccv_nnc_tensor_view_free(allocated_output_views[i]);
714
95
      }
715
51
      ccfree(copy_inputs);
716
51
    }
717
756
    const ccv_nnc_cmd_autotune_val_t val = {
718
756
      .backend = tuned_cmd.backend,
719
756
      .algorithm = tuned_cmd.algorithm
720
756
    };
721
756
    kh_val(g_autotune_executable_cache, kiter) = val;
722
756
    return tuned_cmd;
723
756
  }
724
  // We need to have trial loop through all the data.
725
604
  for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
726
453
  {
727
3.62k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
728
3.17k
    {
729
3.17k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
730
      // We have the exec kernel, and support all the tensor memory types.
731
3.17k
      if (api_registry.exec &&
732
3.17k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
733
3.17k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
734
3.17k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
735
906
      {
736
906
        ccv_nnc_cmd_t candid_cmd = cmd;
737
906
        candid_cmd.backend = backend_init_map[i].backend;
738
        // If a given API exist an autotune function, use that to pick the top algorithm.
739
906
        if (api_registry.autotune)
740
0
        {
741
          // Assuming k == 0 is sufficient, and we can skip.
742
0
          if (k > 0)
743
0
            continue;
744
0
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
745
0
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
746
0
          candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
747
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
748
0
          ccv_nnc_stream_context_drain(stream_context);
749
0
          uint64_t elapsed = ccv_nnc_cmd_mono_time();
750
          // Ready to run.
751
0
          int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
752
0
          ccv_nnc_stream_context_wait(stream_context);
753
0
          elapsed = ccv_nnc_cmd_mono_time() - elapsed;
754
0
          if (status == CCV_NNC_EXEC_SUCCESS &&
755
0
            (best_measured == -1 || elapsed < best_measured))
756
0
          {
757
0
            best_measured = elapsed;
758
0
            tuned_cmd = candid_cmd;
759
0
          }
760
906
        } else {
761
          // Otherwise loop over the existing algorithms and pick the top one.
762
2.55k
          for (j = 0; j < api_registry.algorithms; j++)
763
1.64k
          {
764
1.64k
            candid_cmd.algorithm = j;
765
1.64k
            ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
766
1.64k
            _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
767
1.64k
            uint64_t elapsed = ccv_nnc_cmd_mono_time();
768
            // Ready to run.
769
1.64k
            int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
770
1.64k
            elapsed = ccv_nnc_cmd_mono_time() - elapsed;
771
1.64k
            if (status == CCV_NNC_EXEC_SUCCESS &&
772
1.64k
              (best_measured == -1 || elapsed < best_measured))
773
529
            {
774
529
              best_measured = elapsed;
775
529
              tuned_cmd = candid_cmd;
776
529
            }
777
1.64k
          }
778
906
        }
779
906
      }
780
3.17k
    }
781
453
  }
782
608
  for (i = 0; i < input_size; i++)
783
457
  {
784
457
    if (allocated_inputs[i])
785
0
      ccv_nnc_tensor_free(allocated_inputs[i]);
786
457
    if (allocated_input_views[i])
787
0
      ccv_nnc_tensor_view_free(allocated_input_views[i]);
788
457
  }
789
344
  for (i = 0; i < output_size; i++)
790
193
  {
791
193
    if (allocated_outputs[i])
792
186
      ccv_nnc_tensor_free(allocated_outputs[i]);
793
193
    if (allocated_output_views[i])
794
3
      ccv_nnc_tensor_view_free(allocated_output_views[i]);
795
193
  }
796
151
  ccfree(copy_inputs);
797
151
  const ccv_nnc_cmd_autotune_val_t val = {
798
151
    .backend = tuned_cmd.backend,
799
151
    .algorithm = tuned_cmd.algorithm
800
151
  };
801
151
  kh_val(g_autotune_executable_cache, kiter) = val;
802
151
  return tuned_cmd;
803
907
}
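The trial loop above times each eligible backend/algorithm pair over AUTO_TUNE_TRIAL_SIZE rounds with the monotonic clock and keeps the fastest measurement. A stripped-down sketch of that selection pattern, with made-up candidate workloads standing in for the real kernel dispatch:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t mono_time(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Hypothetical stand-ins for two candidate kernels. */
static void candidate_a(void) { volatile int s = 0; for (int i = 0; i < 100000; i++) s += i; }
static void candidate_b(void) { volatile int s = 0; for (int i = 0; i < 300000; i++) s += i; }

#define TRIAL_SIZE (3)

int main(void)
{
    void (*candidates[])(void) = { candidate_a, candidate_b };
    int64_t best_measured = -1;
    int best = -1;
    for (int k = 0; k < TRIAL_SIZE; k++)
        for (int i = 0; i < 2; i++)
        {
            uint64_t elapsed = mono_time();
            candidates[i]();
            elapsed = mono_time() - elapsed;
            /* Keep the fastest candidate seen so far. */
            if (best_measured == -1 || (int64_t)elapsed < best_measured)
            {
                best_measured = (int64_t)elapsed;
                best = i;
            }
        }
    printf("fastest candidate: %d (%lld ns)\n", best, (long long)best_measured);
    return 0;
}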
804
805
int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
806
147k
{
807
  // If it is no-op, return true, it can deal with any number of parameters.
808
147k
  if (cmd.cmd == CCV_NNC_NOOP)
809
118
    return 1;
810
  // If it is a custom command, I cannot check it at all, return false.
811
147k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
812
2.40k
    return 0;
813
144k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
814
144k
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
815
144k
  if (cmd_registry.bitmask)
816
144k
    return cmd_registry.bitmask(cmd.info, input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
817
  // If there is no bitmask check, none can pass.
818
0
  return 0;
819
144k
}
820
821
int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size)
822
125k
{
823
125k
  int i, j;
824
125k
  int device_id_size = 0;
825
125k
  if (max_device_id_size <= device_id_size)
826
0
    return device_id_size;
827
  // The device id of the exec is determined by its outputs.
828
295k
  for (i = 0; i < output_size; i++)
829
173k
    if (outputs[i] &&
830
173k
      CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
831
173k
      CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
832
22.8k
    {
833
22.8k
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type);
834
22.8k
      int flag = 0;
835
33.5k
      for (j = 0; !flag && j < device_id_size; j++)
836
10.7k
        flag = (device_ids[j] == device_id);
837
22.8k
      if (flag)
838
7.76k
        continue;
839
15.0k
      device_ids[device_id_size++] = device_id;
840
15.0k
      if (device_id_size >= max_device_id_size)
841
3.57k
        return device_id_size;
842
15.0k
    }
843
121k
  if (device_id_size == 0)
844
111k
  {
845
111k
    int device_id = -1;
846
360k
    for (i = 0; i < input_size; i++)
847
249k
      if (inputs[i] &&
848
249k
        CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
849
249k
        CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
850
249k
        (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
851
1.04k
        device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type);
852
111k
    if (device_id >= 0)
853
1.04k
    {
854
1.04k
      device_ids[0] = device_id;
855
1.04k
      return 1;
856
1.04k
    }
857
111k
  }
858
120k
  return device_id_size;
859
121k
}
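The routine above collects the distinct device ids of the output tensors (falling back to the smallest input device id) into a caller-provided array with an upper bound. A reduced sketch of that bounded de-duplication, with plain ints standing in for the tensor type fields and macros:

#include <stdio.h>

/* Collect distinct values from ids[] into out[], stopping at max_out entries. */
static int collect_distinct(const int* ids, int size, int* out, int max_out)
{
    int count = 0;
    for (int i = 0; i < size && count < max_out; i++)
    {
        int flag = 0;
        for (int j = 0; !flag && j < count; j++)
            flag = (out[j] == ids[i]);
        if (!flag)
            out[count++] = ids[i];
    }
    return count;
}

int main(void)
{
    const int ids[] = { 1, 0, 1, 2, 0 };
    int out[2];
    const int n = collect_distinct(ids, 5, out, 2);
    for (int i = 0; i < n; i++)
        printf("%d ", out[i]); /* prints: 1 0 */
    printf("\n");
    return 0;
}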
860
861
void* ccv_nnc_cmd_aux(const ccv_nnc_cmd_t cmd)
862
11
{
863
11
  if (cmd.cmd == CCV_NNC_NOOP ||
864
11
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
865
11
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
866
0
    return 0;
867
11
  assert(cmd.backend != CCV_NNC_NO_BACKEND);
868
11
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
869
11
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
870
11
  const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend);
871
11
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
872
11
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
873
11
  return api_registry.aux;
874
11
}
875
876
int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
877
440k
{
878
  // If it is no-op, return as if succeed already.
879
440k
  if (cmd.cmd == CCV_NNC_NOOP)
880
31.3k
    return 0;
881
409k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
882
  // If it is a custom command, just apply it directly.
883
409k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
884
4.82k
  {
885
4.82k
    int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
886
4.82k
    if (!stream_context)
887
4.43k
      ccv_nnc_stream_context_drain(stream_context);
888
4.82k
    return ret;
889
4.82k
  }
890
409k
  assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
891
404k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
892
404k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
893
404k
  int i;
894
404k
  uint32_t backend = cmd.backend;
895
404k
  if (backend == CCV_NNC_NO_BACKEND)
896
180k
  {
897
    // Find a suitable backend.
898
180k
    int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
899
450k
    for (i = 0; i < input_size; i++)
900
270k
      if (inputs[i])
901
268k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(inputs[i]->info.datatype);
902
406k
    for (i = 0; i < output_size; i++)
903
226k
      if (outputs[i])
904
225k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(outputs[i]->info.datatype);
905
180k
    backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
906
180k
  }
907
404k
  assert(backend != CCV_NNC_NO_BACKEND);
908
404k
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
909
404k
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
910
404k
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
911
404k
  if (!api_registry.exec)
912
0
    return CCV_NNC_EXEC_NO_KERNEL;
913
  // Everything is out, call the underlying implementation.
914
404k
  int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
915
404k
  if (!stream_context)
916
104k
    ccv_nnc_stream_context_drain(stream_context);
917
404k
  return ret;
918
404k
}
919
920
int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
921
0
{
922
  // No additional attr for noop.
923
0
  if (cmd.cmd == CCV_NNC_NOOP ||
924
    // If it is a custom command, just apply it directly.
925
0
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
926
    // If it is sub-graph, there is no additional attr as well.
927
0
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
928
0
    return 0;
929
0
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
930
0
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
931
0
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
932
0
  return !!(cmd_registry.flags & flags);
933
0
}
934
935
void ccv_nnc_set_profiler(int state)
936
0
{
937
0
#ifdef HAVE_CUDA
938
0
  cusetprofiler(state);
939
0
#endif
940
0
}
941
942
int ccv_nnc_queue_watermark(void)
943
0
{
944
#ifdef HAVE_MPS
945
  return ccv_nnc_mps_queue_watermark();
946
#else
947
0
  return 0;
948
0
#endif
949
0
}
950
951
void ccv_nnc_set_queue_watermark(int watermark)
952
0
{
953
#ifdef HAVE_MPS
954
  // If we need to be memory efficient, we need to bound how many in-flight command buffers there are.
955
  ccv_nnc_mps_set_queue_watermark(watermark);
956
#endif
957
0
}
958
959
void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size)
960
2
{
961
2
  if (type != CCV_STREAM_CONTEXT_GPU)
962
0
    return;
963
2
#ifdef HAVE_CUDA
964
2
  cusetdevicemap(device_map, size);
965
2
#endif
966
2
}