Coverage Report

Created: 2021-04-07 21:56

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_cmd.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#ifdef HAVE_CUDA
5
#include "gpu/ccv_nnc_compat.h"
6
#endif
7
#include <time.h>
8
#include <sys/time.h>
9
10
#ifdef __MACH__
11
#include <mach/mach.h>
12
#include <mach/mach_time.h>
13
#endif
14
15
typedef struct {
16
  const uint32_t cmd;
17
  const char* name;
18
  ccv_nnc_cmd_registry_t registry;
19
  ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
20
} ccv_nnc_cmd_init_t;
21
22
typedef struct {
23
  const uint32_t backend;
24
  const char* name;
25
} ccv_nnc_cmd_backend_init_t;
26
27
// The generated code configures commands and their mappings.
28
#include "cmd/ccv_nnc_cmd.inc"
29
30
void ccv_nnc_init(void)
31
1
{
32
1
  _ccv_nnc_cmd_init();
33
1
}
34
35
const char* ccv_nnc_cmd_name(const uint32_t cmd)
36
1.54k
{
37
1.54k
  switch (cmd)
38
1.54k
  {
39
77
    case CCV_NNC_NOOP:
40
77
      return "CCV_NNC_NOOP";
41
2
    case CCV_NNC_CUSTOM_FORWARD:
42
2
      return "CCV_NNC_CUSTOM_FORWARD";
43
0
    case CCV_NNC_CUSTOM_BACKWARD:
44
0
      return "CCV_NNC_CUSTOM_BACKWARD";
45
64
    case CCV_NNC_GRAPH_FORWARD:
46
64
      return "CCV_NNC_GRAPH_FORWARD";
47
5
    case CCV_NNC_GRAPH_BACKWARD:
48
5
      return "CCV_NNC_GRAPH_BACKWARD";
49
1.39k
  }
50
1.39k
  const int idx = _ccv_nnc_cmd_ph(cmd);
51
1.39k
  assert(idx >= 0);
52
1.39k
  assert(idx < sizeof(init_map) / sizeof(init_map[0]));
53
1.39k
  return init_map[idx].name;
54
1.39k
}
55
56
const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
57
0
{
58
0
  if (backend == CCV_NNC_NO_BACKEND)
59
0
    return "CCV_NNC_NO_BACKEND";
60
0
  const int idx = _ccv_nnc_cmd_backend_ph(backend);
61
0
  assert(idx >= 0);
62
0
  assert(idx < CCV_NNC_BACKEND_COUNT);
63
0
  return backend_init_map[idx].name;
64
0
}
65
66
const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
67
68
int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
69
0
{
70
0
  return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
71
0
}
72
73
int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
74
18.0k
{
75
18.0k
  switch (cmd.cmd)
76
18.0k
  {
77
2
    case CCV_NNC_NOOP:
78
2
      return 0;
79
2.40k
    case CCV_NNC_CUSTOM_FORWARD:
80
2.40k
    case CCV_NNC_CUSTOM_BACKWARD:
81
2.40k
    case CCV_NNC_GRAPH_FORWARD:
82
2.40k
    case CCV_NNC_GRAPH_BACKWARD:
83
18.0k
    default:
84
18.0k
      return !(cmd.cmd & 0x1); // If it is even, it is forward
85
18.0k
  }
86
18.0k
}
87
88
int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
89
35.9k
{
90
35.9k
  switch (cmd.cmd)
91
35.9k
  {
92
2
    case CCV_NNC_NOOP:
93
2
      return 0;
94
4.80k
    case CCV_NNC_CUSTOM_FORWARD:
95
4.80k
    case CCV_NNC_CUSTOM_BACKWARD:
96
4.80k
    case CCV_NNC_GRAPH_FORWARD:
97
4.80k
    case CCV_NNC_GRAPH_BACKWARD:
98
35.9k
    default:
99
35.9k
      return !!(cmd.cmd & 0x1); // If it is odd, it is backward
100
35.9k
  }
101
35.9k
}
102
103
int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
104
367
{
105
367
  // If it is a custom command, a no op, or a graph op, there is no backend to check.
106
367
  if (cmd == CCV_NNC_NOOP ||
107
367
    cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
108
367
    cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
109
0
    return 1;
110
367
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
111
367
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
112
367
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
113
367
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
114
367
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
115
367
  // Check if the execution function exists or not.
116
367
  return !!api_registry.exec;
117
367
}
118
119
ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_vtab_t* const isa, const ccv_nnc_cmd_param_t params, const int flags)
120
49.5k
{
121
49.5k
  ccv_nnc_cmd_t cmd;
122
49.5k
  cmd.info = params;
123
49.5k
  cmd.backend = CCV_NNC_NO_BACKEND;
124
49.5k
  assert((_cmd == CCV_NNC_CUSTOM_FORWARD && isa) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !isa));
125
49.5k
  cmd.cmd = _cmd;
126
49.5k
  cmd.algorithm = -1; // This is default.
127
49.5k
  cmd.isa = isa;
128
49.5k
  cmd.data = 0;
129
49.5k
  return cmd;
130
49.5k
}
131
132
const ccv_nnc_hint_t ccv_nnc_no_hint = {};
133
134
int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
135
133k
{
136
133k
  return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
137
133k
}
138
139
int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
140
3
{
141
3
  int i;
142
3
  assert(a.format == b.format);
143
3
  const int nd = ccv_nnc_tensor_nd(a.dim);
144
3
  assert(nd == CCV_NNC_MAX_DIM + 1 || nd == CCV_NNC_MAX_DIM + 2);
145
3
  int hw;
146
3
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
147
3
    (a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 1))
148
0
    hw = 0;
149
3
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 2) ||
150
3
    (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 1))
151
3
    hw = 1;
152
0
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 2)
153
0
    hw = 2;
154
0
  else
155
0
    assert(0 && "unknown format");
156
9
  for (i = 0; i < CCV_NNC_MAX_DIM; i++)
157
6
  {
158
6
    if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
159
0
      return -1;
160
6
    int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
161
6
    if (expected != b.dim[i + hw])
162
0
      return -1;
163
6
  }
164
3
  return 0;
165
3
}
166
167
ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
168
103k
{
169
103k
  int i;
170
103k
  if (a.format != b.format)
171
0
    return ccv_nnc_no_hint;
172
103k
  assert(a.format == b.format);
173
103k
  const int a_nd = ccv_nnc_tensor_nd(a.dim);
174
103k
  const int b_nd = ccv_nnc_tensor_nd(b.dim);
175
103k
  // The dimensions are not deducible for the auto hint.
176
103k
  if (a_nd != b_nd || (a_nd != CCV_NNC_MAX_DIM + 1 && a_nd != CCV_NNC_MAX_DIM + 2))
177
101k
    return ccv_nnc_no_hint;
178
1.42k
  int hw;
179
1.42k
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
180
1.42k
    (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 1))
181
120
    hw = 0;
182
1.30k
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 2) ||
183
1.30k
    (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 1))
184
492
    hw = 1;
185
813
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 2)
186
813
    hw = 2;
187
813
  else
188
813
    assert(0 && "unknown format");
189
1.42k
  ccv_nnc_hint_t hint_auto = {};
190
1.42k
  // 0-dim is reserved for channels
191
4.27k
  for (i = 0; i < CCV_NNC_MAX_DIM; i++)
192
2.85k
  {
193
2.85k
    // If any of the dims is zero, we cannot auto the hint; return no hint.
194
2.85k
    assert(a.dim[i + hw] && b.dim[i + hw]);
195
2.85k
    // This is guessed by having a stride that will approximately match the scale.
196
2.85k
    int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
197
2.85k
    hint_auto.stride.dim[i] = stride;
198
2.85k
    int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
199
2.85k
    hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
200
2.85k
    hint_auto.border.end[i] = border - hint_auto.border.begin[i];
201
2.85k
  }
202
1.42k
  return hint_auto;
203
1.42k
}
204
205
void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
206
16.9k
{
207
16.9k
  int i;
208
16.9k
  assert(output_size <= input_size);
209
35.8k
  for (i = 0; i < output_size; i++)
210
18.9k
    outputs[i] = inputs[i];
211
16.9k
}
212
213
void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
214
7.79k
{
215
7.79k
  int i;
216
18.6k
  for (i = 0; i < output_size; i++)
217
10.8k
    outputs[i] = inputs[0];
218
7.79k
}
219
220
void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
221
20.4k
{
222
20.4k
  int i;
223
20.4k
  assert(output_size < input_size);
224
66.2k
  for (i = 0; i < output_size; i++)
225
45.8k
    outputs[i] = inputs[i + 1];
226
20.4k
}
227
228
void ccv_nnc_hint_tensor_auto_backward_from_gradient_and_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
229
166
{
230
166
  int i;
231
166
  outputs[0] = inputs[0];
232
166
  assert(output_size < input_size);
233
332
  for (i = 1; i < output_size; i++)
234
166
    outputs[i] = inputs[i + 1];
235
166
}
236
237
void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
238
125k
{
239
125k
  // zero out the parameters
240
125k
  const ccv_nnc_tensor_param_t z = {};
241
125k
  int i;
242
320k
  for (i = 0; i < output_size; i++)
243
195k
    outputs[i] = z; // Reset the outputs.
244
125k
  // Cannot handle these situations.
245
125k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
246
3.43k
    return;
247
122k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD)
248
4.40k
  {
249
4.40k
    if (cmd.isa->tensor_auto)
250
4.40k
      cmd.isa->tensor_auto(cmd, inputs, input_size, hint, outputs, output_size);
251
4.40k
    return;
252
4.40k
  }
253
117k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
254
117k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
255
117k
  if (registry.tensor_auto)
256
117k
    registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
257
0
  else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
258
0
    ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
259
0
  else // For backward, the default auto is backward_from_inputs
260
0
    ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
261
117k
}
262
263
int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
264
52.1k
{
265
52.1k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
266
2.53k
    return 0;
267
49.6k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
268
49.6k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
269
49.6k
  if (registry.allow_inplace)
270
18.8k
    return registry.allow_inplace(input_idx, input_size, output_idx, output_size);
271
30.7k
  return 0;
272
30.7k
}
273
274
int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
275
196k
{
276
196k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
277
10.0k
    return 0;
278
186k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
279
186k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
280
186k
  if (registry.enforce_inplace)
281
2.27k
    return registry.enforce_inplace(input_idx, input_size, output_idx, output_size);
282
184k
  return 0;
283
184k
}
284
285
// This returns absolute time.
286
uint64_t ccv_nnc_cmd_mono_time(void)
287
3.01k
{
288
#ifdef __MACH__
289
  return mach_absolute_time();
290
#else
291
3.01k
  struct timespec ts;
292
3.01k
  clock_gettime(CLOCK_MONOTONIC, &ts);
293
3.01k
  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
294
3.01k
#endif
295
3.01k
}
296
297
uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes)
298
265k
{
299
265k
  if (cmd.cmd == CCV_NNC_NOOP ||
300
265k
    
cmd.cmd == CCV_NNC_GRAPH_FORWARD264k
||
cmd.cmd == CCV_NNC_GRAPH_BACKWARD264k
||
301
265k
    
cmd.cmd == CCV_NNC_CUSTOM_FORWARD264k
||
cmd.cmd == CCV_NNC_CUSTOM_BACKWARD262k
)
302
7.75k
    return cmd.backend;
303
257k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
304
257k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
305
257k
  assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0);
306
257k
  int i;
307
1.06M
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
308
1.06M
  {
309
1.06M
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
310
1.06M
    // We have the exec kernel, and support all the tensor memory types.
311
1.06M
    if (api_registry.exec &&
312
1.06M
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
313
1.06M
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
314
1.06M
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
315
257k
      return backend_init_map[i].backend;
316
1.06M
  }
317
257k
  return cmd.backend;
318
257k
}
319
320
528
#define AUTO_TUNE_TRIAL_SIZE (3)
321
322
static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
323
396k
{
324
396k
#ifdef HAVE_CUDA
325
396k
  if (!stream_context)
326
103k
  {
327
103k
    int device_id;
328
103k
    if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, CCV_TENSOR_GPU_MEMORY, &device_id, 1) > 0)
329
5.52k
      cudevice(device_id);
330
103k
  }
331
396k
#endif
332
396k
}
333
334
ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
335
2.62k
{
336
2.62k
  // This is a custom cmd kernel, no need to autotune.
337
2.62k
  if (cmd.cmd == CCV_NNC_NOOP ||
338
2.62k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
339
2.62k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
340
0
    return cmd;
341
2.62k
  int i, j, k;
342
2.62k
  // Go through all the backends that support the same type of memory for the input / output tensors.
343
2.62k
  int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
344
10.8k
  for (i = 0; i < input_size; i++)
345
8.25k
    if (inputs[i])
346
6.12k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
347
7.15k
  for (i = 0; i < output_size; i++)
348
4.52k
    if (outputs[i])
349
4.34k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
350
2.62k
  // In this case, we cannot determine the type of the tensor, skip auto-tune.
351
2.62k
  if (!tensor_memory)
352
0
    return cmd;
353
2.62k
  // Otherwise, we are good to go.
354
2.62k
  ccv_nnc_cmd_t tuned_cmd = cmd;
355
2.62k
  int64_t best_measured = -1;
356
2.62k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
357
2.62k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
358
2.62k
  int flag = 0;
359
18.1k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
360
15.6k
  {
361
15.6k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
362
15.6k
    // We have the exec kernel, and support all the tensor memory types.
363
15.6k
    if (api_registry.exec &&
364
15.6k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
365
15.6k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
366
15.6k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
367
2.75k
      if ((++flag) >= 2) // If we have at least two suitable backends, we can stop searching now.
368
132
        break;
369
15.6k
  }
370
2.62k
  if (flag == 0)
371
0
    return cmd;
372
2.62k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
373
2.62k
  // Allocate inputs / outputs and fill them in.
374
2.62k
  ccv_nnc_tensor_t** const copy_inputs = (ccv_nnc_tensor_t**)cccalloc((input_size + output_size) * 3, sizeof(ccv_nnc_tensor_t*));
375
2.62k
  ccv_nnc_tensor_t** const copy_outputs = copy_inputs + input_size;
376
2.62k
  ccv_nnc_tensor_t** const allocated_inputs = copy_outputs + output_size;
377
2.62k
  ccv_nnc_tensor_t** const allocated_outputs = allocated_inputs + input_size;
378
2.62k
  ccv_nnc_tensor_view_t** const allocated_input_views = (ccv_nnc_tensor_view_t**)(allocated_outputs + output_size);
379
2.62k
  ccv_nnc_tensor_view_t** const allocated_output_views = allocated_input_views + input_size;
380
7.15k
  for (i = 0; i < output_size; i++)
381
4.52k
    if (outputs[i])
382
4.34k
    {
383
18.6k
      for (j = 0; j < input_size; j++)
384
15.7k
        if (inputs[j])
385
11.0k
        {
386
11.0k
          if (outputs[i] == inputs[j])
387
1.18k
          {
388
1.18k
            if (!copy_inputs[j])
389
1.18k
            {
390
1.18k
              allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
391
1.18k
              if (CCV_IS_TENSOR_VIEW(inputs[j]))
392
1.18k
                copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), inputs[j]->info.dim));
393
1.18k
              else
394
1.18k
                copy_inputs[j] = allocated_inputs[j];
395
1.18k
            }
396
1.18k
            copy_outputs[i] = copy_inputs[j];
397
1.18k
            break;
398
9.89k
          } else if (outputs[i]->data.u8 == inputs[j]->data.u8 &&
399
9.89k
            ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
400
287
            if (!copy_inputs[j])
401
287
            {
402
287
              allocated_inputs[j] = ccv_nnc_tensor_new(0, inputs[j]->info, 0);
403
287
              if (CCV_IS_TENSOR_VIEW(inputs[j]))
404
287
                copy_inputs[j] = (ccv_nnc_tensor_t*)(allocated_input_views[j] = ccv_nnc_tensor_view_new(allocated_inputs[j], inputs[j]->info, DIM_ALLOC(), inputs[j]->info.dim));
405
287
              else
406
287
                copy_inputs[j] = allocated_inputs[j];
407
287
            }
408
287
            allocated_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0);
409
287
            if (CCV_IS_TENSOR_VIEW(outputs[i]))
410
287
              copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), outputs[i]->info.dim));
411
287
            else
412
287
              copy_outputs[i] = allocated_outputs[i];
413
287
            break;
414
287
          }
415
11.0k
        }
416
4.34k
      if (!copy_outputs[i])
417
2.86k
      {
418
2.86k
        allocated_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0);
419
2.86k
        if (CCV_IS_TENSOR_VIEW(outputs[i]))
420
2.86k
          copy_outputs[i] = (ccv_nnc_tensor_t*)(allocated_output_views[i] = ccv_nnc_tensor_view_new(allocated_outputs[i], outputs[i]->info, DIM_ALLOC(), outputs[i]->info.dim));
421
2.86k
        else
422
2.86k
          copy_outputs[i] = allocated_outputs[i];
423
2.86k
      }
424
4.34k
    }
425
10.8k
  for (i = 0; i < input_size; i++)
426
8.25k
    if (inputs[i] && !copy_inputs[i])
427
4.65k
      copy_inputs[i] = inputs[i];
428
2.62k
  if (flag == 1)
429
2.49k
  {
430
7.87k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
431
7.87k
    {
432
7.87k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
433
7.87k
      // We have the exec kernel, and support all the tensor memory types.
434
7.87k
      if (api_registry.exec &&
435
7.87k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
436
7.87k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
437
7.87k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
438
2.49k
      {
439
2.49k
        tuned_cmd.backend = backend_init_map[i].backend;
440
2.49k
        // If a given API has an autotune function, use it to pick the top algorithm.
441
2.49k
        if (api_registry.autotune)
442
147
        {
443
147
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
444
147
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
445
147
          tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
446
147
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
447
147
          ccv_nnc_stream_context_drain(stream_context);
448
147
        }
449
2.49k
        break;
450
2.49k
      }
451
7.87k
    }
452
10.3k
    for (i = 0; i < input_size; i++)
453
7.82k
    {
454
7.82k
      if (allocated_inputs[i])
455
1.47k
        ccv_nnc_tensor_free(allocated_inputs[i]);
456
7.82k
      if (allocated_input_views[i])
457
0
        ccv_nnc_tensor_view_free(allocated_input_views[i]);
458
7.82k
    }
459
6.83k
    for (i = 0; i < output_size; i++)
460
4.34k
    {
461
4.34k
      if (allocated_outputs[i])
462
2.97k
        ccv_nnc_tensor_free(allocated_outputs[i]);
463
4.34k
      if (allocated_output_views[i])
464
6
        ccv_nnc_tensor_view_free(allocated_output_views[i]);
465
4.34k
    }
466
2.49k
    ccfree(copy_inputs);
467
2.49k
    return tuned_cmd;
468
2.49k
  }
469
132
  // We need to run a trial loop through all the data.
470
528
  for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
471
396
  {
472
2.77k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
473
2.37k
    {
474
2.37k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
475
2.37k
      // We have the exec kernel, and support all the tensor memory types.
476
2.37k
      if (api_registry.exec &&
477
2.37k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
478
2.37k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
479
2.37k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
480
792
      {
481
792
        ccv_nnc_cmd_t candid_cmd = cmd;
482
792
        candid_cmd.backend = backend_init_map[i].backend;
483
792
        // If a given API has an autotune function, use it to pick the top algorithm.
484
792
        if (api_registry.autotune)
485
0
        {
486
0
          // Assuming k == 0 is sufficient, and we can skip.
487
0
          if (k > 0)
488
0
            continue;
489
0
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
490
0
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
491
0
          candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
492
0
          // Drain the context, autotune can use excessive amount of memory. Need to drain it now.
493
0
          ccv_nnc_stream_context_drain(stream_context);
494
0
          uint64_t elapsed = ccv_nnc_cmd_mono_time();
495
0
          // Ready to run.
496
0
          int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
497
0
          ccv_nnc_stream_context_wait(stream_context);
498
0
          elapsed = ccv_nnc_cmd_mono_time() - elapsed;
499
0
          if (status == CCV_NNC_EXEC_SUCCESS &&
500
0
            (best_measured == -1 || elapsed < best_measured))
501
0
          {
502
0
            best_measured = elapsed;
503
0
            tuned_cmd = candid_cmd;
504
0
          }
505
792
        } else {
506
792
          // Otherwise loop over the existing algorithms and pick the top one.
507
2.29k
          for (j = 0; j < api_registry.algorithms; j++)
508
1.50k
          {
509
1.50k
            candid_cmd.algorithm = j;
510
1.50k
            ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
511
1.50k
            _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
512
1.50k
            uint64_t elapsed = ccv_nnc_cmd_mono_time();
513
1.50k
            // Ready to run.
514
1.50k
            int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
515
1.50k
            elapsed = ccv_nnc_cmd_mono_time() - elapsed;
516
1.50k
            if (status == CCV_NNC_EXEC_SUCCESS &&
517
1.50k
              (best_measured == -1 || elapsed < best_measured))
518
418
            {
519
418
              best_measured = elapsed;
520
418
              tuned_cmd = candid_cmd;
521
418
            }
522
1.50k
          }
523
792
        }
524
792
      }
525
2.37k
    }
526
396
  }
527
566
  for (i = 0; i < input_size; i++)
528
434
  {
529
434
    if (allocated_inputs[i])
530
0
      ccv_nnc_tensor_free(allocated_inputs[i]);
531
434
    if (allocated_input_views[i])
532
0
      ccv_nnc_tensor_view_free(allocated_input_views[i]);
533
434
  }
534
311
  for (i = 0; i < output_size; i++)
535
179
  {
536
179
    if (allocated_outputs[i])
537
179
      ccv_nnc_tensor_free(allocated_outputs[i]);
538
179
    if (allocated_output_views[i])
539
2
      ccv_nnc_tensor_view_free(allocated_output_views[i]);
540
179
  }
541
132
  ccfree(copy_inputs);
542
132
  return tuned_cmd;
543
132
}
544
545
int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
546
154k
{
547
154k
  // If it is no-op, return true, it can deal with any number of parameters.
548
154k
  if (cmd.cmd == CCV_NNC_NOOP)
549
69
    return 1;
550
154k
  // If it is a custom command, I cannot check it at all, return false.
551
154k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
552
2.40k
    return 0;
553
152k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
554
152k
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
555
152k
  if (cmd_registry.bitmask)
556
152k
    return cmd_registry.bitmask(input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
557
0
  // If there is no bitmask check, none can pass.
558
0
  return 0;
559
0
}
560
561
int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int tensor_type, int* const device_ids, const int max_device_id_size)
562
116k
{
563
116k
  int i, j;
564
116k
  int device_id_size = 0;
565
116k
  if (max_device_id_size <= device_id_size)
566
0
    return device_id_size;
567
116k
  // The device id of the exec is determined by its outputs.
568
272k
  for (i = 0; i < output_size; i++)
569
160k
    if (outputs[i] &&
570
160k
      CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == tensor_type &&
571
160k
      CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
572
24.8k
    {
573
24.8k
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type);
574
24.8k
      int flag = 0;
575
35.6k
      for (j = 0; !flag && j < device_id_size; j++)
576
10.7k
        flag = (device_ids[j] == device_id);
577
24.8k
      if (flag)
578
7.85k
        continue;
579
17.0k
      device_ids[device_id_size++] = device_id;
580
17.0k
      if (device_id_size >= max_device_id_size)
581
5.13k
        return device_id_size;
582
17.0k
    }
583
116k
  if (device_id_size == 0)
584
101k
  {
585
101k
    int device_id = -1;
586
322k
    for (i = 0; i < input_size; i++)
587
221k
      if (inputs[i] &&
588
221k
        CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == tensor_type &&
589
221k
        CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
590
221k
        (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
591
789
        device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type);
592
101k
    if (device_id >= 0)
593
789
    {
594
789
      device_ids[0] = device_id;
595
789
      return 1;
596
789
    }
597
110k
  }
598
110k
  return device_id_size;
599
110k
}
600
601
int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
602
423k
{
603
423k
  // If it is a no-op, return as if it had already succeeded.
604
423k
  if (cmd.cmd == CCV_NNC_NOOP)
605
31.0k
    return 0;
606
391k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
607
391k
  // If it is a custom command, just apply it directly.
608
391k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
609
4.80k
  {
610
4.80k
    int ret = cmd.isa->exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
611
4.80k
    if (!stream_context)
612
4.41k
      ccv_nnc_stream_context_drain(stream_context);
613
4.80k
    return ret;
614
4.80k
  }
615
387k
  assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
616
387k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
617
387k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
618
387k
  int i;
619
387k
  uint32_t backend = cmd.backend;
620
387k
  if (backend == CCV_NNC_NO_BACKEND)
621
178k
  {
622
178k
    // Find a suitable backend.
623
178k
    int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
624
442k
    for (i = 0; i < input_size; i++)
625
264k
      if (inputs[i])
626
263k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
627
400k
    for (i = 0; i < output_size; i++)
628
222k
      if (outputs[i])
629
221k
        tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
630
178k
    backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
631
178k
  }
632
387k
  assert(backend != CCV_NNC_NO_BACKEND);
633
387k
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
634
387k
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
635
387k
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
636
387k
  if (!api_registry.exec)
637
0
    return CCV_NNC_EXEC_NO_KERNEL;
638
387k
  // Everything is out, call the underlying implementation.
639
387k
  int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
640
387k
  if (!stream_context)
641
95.2k
    ccv_nnc_stream_context_drain(stream_context);
642
387k
  return ret;
643
387k
}
644
645
int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
646
0
{
647
0
  // No additional attr for noop.
648
0
  if (cmd.cmd == CCV_NNC_NOOP ||
649
0
    // If it is a custom command, just apply it directly.
650
0
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
651
0
    // If it is sub-graph, there is no additional attr as well.
652
0
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
653
0
    return 0;
654
0
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
655
0
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
656
0
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
657
0
  return !!(cmd_registry.flags & flags);
658
0
}
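
Below is a minimal, hypothetical usage sketch of the command API covered above, for orientation only; it is not part of the measured source. It assumes the CPU_TENSOR_NHWC and TENSOR_LIST convenience macros from ccv_nnc_easy.h and that the headers are on the include path; ccv_nnc_init, ccv_nnc_cmd_name, CMD_DATA_TRANSFER_FORWARD, ccv_nnc_no_hint, ccv_nnc_cmd_exec, ccv_nnc_tensor_new and ccv_nnc_tensor_free all appear in the listing.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include <stdio.h>

int main(void)
{
  ccv_nnc_init();
  // Two 2-element CPU tensors; CPU_TENSOR_NHWC is assumed to come from ccv_nnc_easy.h.
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  a->data.f32[0] = 1;
  a->data.f32[1] = 2;
  // A data-transfer command simply copies a into b.
  const ccv_nnc_cmd_t cmd = CMD_DATA_TRANSFER_FORWARD();
  printf("running %s\n", ccv_nnc_cmd_name(cmd.cmd));
  // cmd.backend is CCV_NNC_NO_BACKEND here, so ccv_nnc_cmd_exec resolves a concrete
  // backend via ccv_nnc_cmd_find_backend before dispatching, as shown in the listing.
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  printf("b = [%g, %g]\n", b->data.f32[0], b->data.f32[1]);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  return 0;
}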