Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_cmd.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_internal.h"
3
#include "ccv_nnc_easy.h"
4
#ifdef HAVE_CUDA
5
#include "gpu/ccv_nnc_compat.h"
6
#endif
7
#include <time.h>
8
#include <sys/time.h>
9
10
#ifdef __MACH__
11
#include <mach/mach.h>
12
#include <mach/mach_time.h>
13
#endif
14
15
typedef struct {
16
  const uint32_t cmd;
17
  const char* name;
18
  ccv_nnc_cmd_registry_t registry;
19
  ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
20
} ccv_nnc_cmd_init_t;
21
22
typedef struct {
23
  const uint32_t backend;
24
  const char* name;
25
} ccv_nnc_cmd_backend_init_t;
26
27
// The generated code configures commands and their mappings.
28
#include "cmd/ccv_nnc_cmd.inc"
29
30
void ccv_nnc_init(void)
31
1
{
32
1
  _ccv_nnc_cmd_init();
33
1
}
34
35
const char* ccv_nnc_cmd_name(const uint32_t cmd)
36
1.21k
{
37
1.21k
  switch (cmd)
38
1.21k
  {
39
1.21k
    case CCV_NNC_NOOP:
40
68
      return "CCV_NNC_NOOP";
41
1.21k
    case CCV_NNC_CUSTOM_FORWARD:
42
0
      return "CCV_NNC_CUSTOM_FORWARD";
43
1.21k
    case CCV_NNC_CUSTOM_BACKWARD:
44
0
      return "CCV_NNC_CUSTOM_BACKWARD";
45
1.21k
    case CCV_NNC_GRAPH_FORWARD:
46
64
      return "CCV_NNC_GRAPH_FORWARD";
47
1.21k
    case CCV_NNC_GRAPH_BACKWARD:
48
5
      return "CCV_NNC_GRAPH_BACKWARD";
49
1.07k
  }
50
1.07k
  const int idx = _ccv_nnc_cmd_ph(cmd);
51
1.07k
  assert(idx >= 0);
52
1.07k
  assert(idx < sizeof(init_map) / sizeof(init_map[0]));
53
1.07k
  return init_map[idx].name;
54
1.07k
}
55
56
const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
57
0
{
58
0
  if (backend == CCV_NNC_NO_BACKEND)
59
0
    return "CCV_NNC_NO_BACKEND";
60
0
  const int idx = _ccv_nnc_cmd_backend_ph(backend);
61
0
  assert(idx >= 0);
62
0
  assert(idx < CCV_NNC_BACKEND_COUNT);
63
0
  return backend_init_map[idx].name;
64
0
}
65
66
const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
67
68
int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
69
0
{
70
0
  return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
71
0
}
72
73
int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
74
3.28k
{
75
3.28k
  switch (cmd.cmd)
76
3.28k
  {
77
3.28k
    case CCV_NNC_NOOP:
78
2
      return 0;
79
3.28k
    case CCV_NNC_CUSTOM_FORWARD:
80
3
    case CCV_NNC_CUSTOM_BACKWARD:
81
3
    case CCV_NNC_GRAPH_FORWARD:
82
3
    case CCV_NNC_GRAPH_BACKWARD:
83
3.27k
    default:
84
3.27k
      return !(cmd.cmd & 0x1); // If it is even, it is forward
85
3.28k
  }
86
3.28k
}
87
88
int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
89
6.52k
{
90
6.52k
  switch (cmd.cmd)
91
6.52k
  {
92
6.52k
    case CCV_NNC_NOOP:
93
2
      return 0;
94
6.52k
    case CCV_NNC_CUSTOM_FORWARD:
95
6
    case CCV_NNC_CUSTOM_BACKWARD:
96
6
    case CCV_NNC_GRAPH_FORWARD:
97
6
    case CCV_NNC_GRAPH_BACKWARD:
98
6.52k
    default:
99
6.52k
      return !!(cmd.cmd & 0x1); // If it is odd, it is backward
100
6.52k
  }
101
6.52k
}
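
The two predicates above lean on the library convention that a concrete forward command carries an even identifier and its backward counterpart the odd identifier right after it. A minimal sketch of how a caller might use them, built only from names that appear in this file (plus printf); wrapping a bare identifier through ccv_nnc_cmd() with a NULL exec follows the constructor shown further down:

#include <stdio.h>
#include "ccv_nnc.h"

static void print_direction(const uint32_t cmd_id)
{
  // exec must be NULL for anything other than CCV_NNC_CUSTOM_FORWARD (see the assert in ccv_nnc_cmd()).
  const ccv_nnc_cmd_t cmd = ccv_nnc_cmd(cmd_id, 0, ccv_nnc_cmd_auto, 0);
  printf("%s -> forward: %d, backward: %d\n", ccv_nnc_cmd_name(cmd_id),
    ccv_nnc_cmd_is_forward(cmd), ccv_nnc_cmd_is_backward(cmd));
}

int main(void)
{
  ccv_nnc_init();
  print_direction(CCV_NNC_GRAPH_FORWARD); // graph forward / backward pair handled by the switch above
  print_direction(CCV_NNC_GRAPH_BACKWARD);
  return 0;
}
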
102
103
int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
104
102
{
105
102
  // If it is a custom command, a no op, or a graph op, there is no backend to check.
106
102
  if (cmd == CCV_NNC_NOOP ||
107
102
    cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
108
102
    cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
109
0
    return 1;
110
102
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
111
102
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
112
102
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
113
102
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
114
102
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
115
102
  // Check if the execution function exists or not.
116
102
  return !!api_registry.exec;
117
102
}
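
A sketch of the intended use of ccv_nnc_cmd_ok(): probe a (command, backend) pair before committing to it. CCV_NNC_CONVOLUTION_FORWARD and CCV_NNC_BACKEND_CPU_REF are assumptions here, registered through the generated cmd/ccv_nnc_cmd.inc rather than defined in this file:

#include <stdio.h>
#include "ccv_nnc.h"

int main(void)
{
  ccv_nnc_init();
  // Returns 1 only when the backend registered an exec function for this command.
  if (ccv_nnc_cmd_ok(CCV_NNC_CONVOLUTION_FORWARD, CCV_NNC_BACKEND_CPU_REF))
    printf("convolution forward has a CPU reference kernel\n");
  return 0;
}
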
118
119
ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_exec_f exec, const ccv_nnc_cmd_param_t params, const int flags)
120
27.6k
{
121
27.6k
  ccv_nnc_cmd_t cmd;
122
27.6k
  cmd.info = params;
123
27.6k
  cmd.backend = CCV_NNC_NO_BACKEND;
124
27.6k
  assert((_cmd == CCV_NNC_CUSTOM_FORWARD && exec) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !exec));
125
27.6k
  cmd.cmd = _cmd;
126
27.6k
  cmd.algorithm = -1; // This is default.
127
27.6k
  cmd.exec = exec;
128
27.6k
  return cmd;
129
27.6k
}
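
The constructor above requires an exec callback exactly when the command is CCV_NNC_CUSTOM_FORWARD. A hedged sketch of wrapping a user callback that way; the callback body is a placeholder, and its parameter list mirrors how cmd.exec is invoked in ccv_nnc_cmd_exec() further down:

#include "ccv_nnc.h"

// Placeholder user kernel: does nothing and reports success.
static int my_custom_kernel(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags,
  ccv_nnc_tensor_t* const* const inputs, const int input_size,
  ccv_nnc_tensor_t* const* const outputs, const int output_size,
  ccv_nnc_stream_context_t* const stream_context)
{
  return CCV_NNC_EXEC_SUCCESS;
}

static ccv_nnc_cmd_t make_custom_cmd(void)
{
  // Custom forward is the only case where exec must be non-NULL.
  return ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, my_custom_kernel, ccv_nnc_cmd_auto, 0);
}
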
130
131
const ccv_nnc_hint_t ccv_nnc_no_hint = {};
132
133
int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
134
14.6k
{
135
14.6k
  return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
136
14.6k
}
137
138
int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
139
3
{
140
3
  int i;
141
3
  assert(a.format == b.format);
142
3
  const int nd = ccv_nnc_tensor_nd(a.dim);
143
3
  assert(nd == CCV_NNC_MAX_DIM + 1 || nd == CCV_NNC_MAX_DIM + 2);
144
3
  int hw;
145
3
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
146
3
    (a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 1))
147
0
    hw = 0;
148
3
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 2) ||
149
3
    (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 1))
150
3
    hw = 1;
151
0
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 2)
152
0
    hw = 2;
153
0
  else
154
0
    assert(0 && "unknown format");
155
9
  for (i = 0; i < CCV_NNC_MAX_DIM; i++)
156
6
  {
157
6
    if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
158
0
      return -1;
159
6
    int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
160
6
    if (expected != b.dim[i + hw])
161
0
      return -1;
162
6
  }
163
3
  return 0;
164
3
}
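
A worked instance of the check above, assuming CCV_NNC_MAX_DIM is 2 so the loop covers the two spatial dims: with a 3-wide window (cmd.size.dim[i] = 3), a 32-wide input extent, stride 1 and a border of 1 on each end, the remainder test is (1 + 1 + 32 - 3) % 1 == 0 and the expected extent is (1 + 1 + 32 - 3) / 1 + 1 = 32, so the hint verifies (returns 0) only when b.dim[i + hw] is also 32 in that dim.
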
165
166
ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
167
12.4k
{
168
12.4k
  int i;
169
12.4k
  if (a.format != b.format)
170
0
    return ccv_nnc_no_hint;
171
12.4k
  assert(a.format == b.format);
172
12.4k
  const int a_nd = ccv_nnc_tensor_nd(a.dim);
173
12.4k
  const int b_nd = ccv_nnc_tensor_nd(b.dim);
174
12.4k
  // These dimensions do not allow an auto hint to be deduced.
175
12.4k
  if (a_nd != b_nd || (a_nd != CCV_NNC_MAX_DIM + 1 && a_nd != CCV_NNC_MAX_DIM + 2))
176
12.0k
    return ccv_nnc_no_hint;
177
481
  int hw;
178
481
  if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
179
481
    (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 1))
180
114
    hw = 0;
181
367
  else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 2) ||
182
367
    (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 1))
183
141
    hw = 1;
184
226
  else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 2)
185
226
    hw = 2;
186
226
  else
187
226
    assert(0 && "unknown format");
188
481
  ccv_nnc_hint_t hint_auto = {};
189
481
  // 0-dim is reserved for channels
190
1.44k
  for (i = 0; i < CCV_NNC_MAX_DIM; i++)
191
962
  {
192
962
    // None of the dims can be zero; otherwise we cannot auto the hint.
193
962
    assert(a.dim[i + hw] && b.dim[i + hw]);
194
962
    // This is guessed by having a stride that will approximately match the scale.
195
962
    int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
196
962
    hint_auto.stride.dim[i] = stride;
197
962
    int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
198
962
    hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
199
962
    hint_auto.border.end[i] = border - hint_auto.border.begin[i];
200
962
  }
201
481
  return hint_auto;
202
481
}
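
As a concrete instance of the guess above (again assuming CCV_NNC_MAX_DIM is 2): halving a 32-wide extent to 16 with a 3-wide window gives stride = (32 + 16/2) / 16 = 2 and border = (16 - 1) * 2 - 32 + 3 = 1, which splits into border.begin = (1 + 1) / 2 = 1 and border.end = 1 - 1 = 0, so the odd unit of padding lands at the beginning, matching the CUDNN-style preference noted in the comment.
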
203
204
void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
205
3.76k
{
206
3.76k
  int i;
207
3.76k
  assert(output_size <= input_size);
208
7.69k
  for (i = 0; i < output_size; i++)
209
3.93k
    outputs[i] = inputs[i];
210
3.76k
}
211
212
void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
213
2.19k
{
214
2.19k
  int i;
215
5.47k
  for (i = 0; i < output_size; i++)
216
3.27k
    outputs[i] = inputs[0];
217
2.19k
}
218
219
void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
220
1.15k
{
221
1.15k
  int i;
222
1.15k
  assert(output_size < input_size);
223
4.52k
  for (i = 0; i < output_size; i++)
224
3.36k
    outputs[i] = inputs[i + 1];
225
1.15k
}
226
227
void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
228
13.4k
{
229
13.4k
  // zero out the parameters
230
13.4k
  const ccv_nnc_tensor_param_t z = {};
231
13.4k
  int i;
232
32.7k
  for (i = 0; i < output_size; i++)
233
19.3k
    outputs[i] = z; // Reset the outputs.
234
13.4k
  // Cannot handle these situations.
235
13.4k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
236
38
    return;
237
13.4k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
238
13.4k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
239
13.4k
  if (registry.tensor_auto)
240
13.4k
    registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
241
0
  else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
242
0
    ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
243
0
  else // For backward, the default auto is backward_from_inputs
244
0
    ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
245
13.4k
}
246
247
int ccv_nnc_cmd_allow_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
248
16.2k
{
249
16.2k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
250
71
    return 0;
251
16.1k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
252
16.1k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
253
16.1k
  if (registry.allow_inplace)
254
6.75k
    return registry.allow_inplace(input_idx, input_size, output_idx, output_size);
255
9.42k
  return 0;
256
9.42k
}
257
258
int ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const int input_idx, const int input_size, const int output_idx, const int output_size)
259
53.7k
{
260
53.7k
  if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
261
77
    return 0;
262
53.7k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
263
53.7k
  const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
264
53.7k
  if (registry.enforce_inplace)
265
2.08k
    return registry.enforce_inplace(input_idx, input_size, output_idx, output_size);
266
51.6k
  return 0;
267
51.6k
}
268
269
// This returns absolute time.
270
uint64_t ccv_nnc_cmd_mono_time(void)
271
2.43k
{
272
#ifdef __MACH__
273
  return mach_absolute_time();
274
#else
275
  struct timespec ts;
276
2.43k
  clock_gettime(CLOCK_MONOTONIC, &ts);
277
2.43k
  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
278
2.43k
#endif
279
2.43k
}
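
ccv_nnc_cmd_mono_time() is the clock the autotuner below uses to rank backends and algorithms. A sketch of that measurement pattern, with all tensors and the stream context assumed to be prepared by the caller:

#include "ccv_nnc.h"

// Sketch: wall-clock a single dispatch the way the autotune trial loop does.
static uint64_t time_cmd_once(const ccv_nnc_cmd_t cmd,
  ccv_nnc_tensor_t* const* const inputs, const int input_size,
  ccv_nnc_tensor_t* const* const outputs, const int output_size,
  ccv_nnc_stream_context_t* const stream_context)
{
  uint64_t elapsed = ccv_nnc_cmd_mono_time();
  ccv_nnc_cmd_exec(cmd, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, stream_context);
  ccv_nnc_stream_context_wait(stream_context); // ensure async work finished before reading the clock again
  return ccv_nnc_cmd_mono_time() - elapsed;
}
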
280
281
uint32_t ccv_nnc_cmd_find_backend(const ccv_nnc_cmd_t cmd, const int tensor_memory, const int tensor_formats, const int tensor_datatypes)
282
2.16M
{
283
2.16M
  if (cmd.cmd == CCV_NNC_NOOP ||
284
2.16M
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
285
2.16M
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
286
114
    return cmd.backend;
287
2.16M
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
288
2.16M
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
289
2.16M
  assert(tensor_memory != 0 && tensor_formats != 0 && tensor_datatypes != 0);
290
2.16M
  int i;
291
8.31M
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
292
8.31M
  {
293
8.31M
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
294
8.31M
    // We have the exec kernel, and support all the tensor memory types.
295
8.31M
    if (api_registry.exec &&
296
8.31M
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
297
8.31M
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
298
8.31M
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
299
2.16M
      return backend_init_map[i].backend;
300
8.31M
  }
301
2.16M
  return cmd.backend;
302
2.16M
}
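
The lookup above keys on three bitmasks accumulated from the tensors a command will touch, which is exactly how ccv_nnc_cmd_exec() resolves a backend further down when cmd.backend is CCV_NNC_NO_BACKEND. A sketch of building those masks by hand:

#include "ccv_nnc.h"

// Sketch: resolve a backend for cmd given the tensors it will touch,
// mirroring the mask accumulation in ccv_nnc_cmd_exec() below.
static uint32_t resolve_backend(const ccv_nnc_cmd_t cmd,
  ccv_nnc_tensor_t* const* const inputs, const int input_size,
  ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
  int i, tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
  for (i = 0; i < input_size; i++)
    if (inputs[i])
      tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
  for (i = 0; i < output_size; i++)
    if (outputs[i])
      tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
  return ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
}
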
303
304
400
#define AUTO_TUNE_TRIAL_SIZE (3)
305
306
static void _ccv_nnc_cmd_set_device_id(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
307
3.23M
{
308
3.23M
#ifdef HAVE_CUDA
309
3.23M
  if (!stream_context)
310
96.7k
  {
311
96.7k
    int device_id;
312
96.7k
    if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, &device_id, 1) > 0)
313
4.62k
      cudevice(device_id);
314
96.7k
  }
315
3.23M
#endif
316
3.23M
}
317
318
ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
319
1.09k
{
320
1.09k
  // No-op, graph, or custom commands have nothing to autotune.
321
1.09k
  if (cmd.cmd == CCV_NNC_NOOP ||
322
1.09k
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD ||
323
1.09k
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
324
0
    return cmd;
325
1.09k
  int i, j, k;
326
1.09k
  // Go through all the backends that support the memory type, format and datatype of the input / output tensors.
327
1.09k
  int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
328
4.94k
  for (i = 0; i < input_size; i++)
329
3.85k
    if (inputs[i])
330
2.88k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
331
3.25k
  for (i = 0; i < output_size; i++)
332
2.15k
    if (outputs[i])
333
2.00k
      tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
334
1.09k
  // In this case, we cannot determine the type of the tensor, skip auto-tune.
335
1.09k
  if (!tensor_memory)
336
0
    return cmd;
337
1.09k
  // Otherwise, we are good to go.
338
1.09k
  ccv_nnc_cmd_t tuned_cmd = cmd;
339
1.09k
  int64_t best_measured = -1;
340
1.09k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
341
1.09k
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
342
1.09k
  int flag = 0;
343
7.47k
  for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
344
6.47k
  {
345
6.47k
    const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
346
6.47k
    // We have the exec kernel, and support all the tensor memory types.
347
6.47k
    if (api_registry.exec &&
348
6.47k
      (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
349
6.47k
      (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
350
6.47k
      (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
351
1.19k
      if ((++flag) >= 2) // If we have two or more suitable backends, we can stop counting now.
352
100
        break;
353
6.47k
  }
354
1.09k
  if (flag == 0)
355
0
    return cmd;
356
1.09k
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
357
1.09k
  // Allocate inputs / outputs and fill them in.
358
1.09k
  ccv_nnc_tensor_t** const copy_inputs = (ccv_nnc_tensor_t**)malloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size * 2));
359
1.09k
  ccv_nnc_tensor_t** const copy_outputs = copy_inputs + input_size;
360
1.09k
  ccv_nnc_tensor_t** const allocated_outputs = copy_outputs + output_size;
361
4.94k
  for (i = 0; i < input_size; i++)
362
3.85k
    copy_inputs[i] = (inputs[i]) ? ccv_nnc_tensor_new(0, inputs[i]->info, 0) : 0;
363
3.25k
  for (i = 0; i < output_size; i++)
364
2.15k
  {
365
2.15k
    allocated_outputs[i] = copy_outputs[i] = 0;
366
2.15k
    if (outputs[i])
367
2.00k
    {
368
9.12k
      for (j = 0; j < input_size; j++)
369
8.13k
        if (inputs[j])
370
5.78k
        {
371
5.78k
          if (outputs[i] == inputs[j])
372
796
          {
373
796
            copy_outputs[i] = copy_inputs[j];
374
796
            break;
375
4.98k
          } else if (outputs[i]->data.u8 == inputs[j]->data.u8 &&
376
4.98k
            ccv_nnc_tensor_count(outputs[i]->info) == ccv_nnc_tensor_count(inputs[j]->info)) {
377
223
            allocated_outputs[i] = copy_outputs[i] = ccv_nnc_tensor_new(copy_inputs[j]->data.u8, outputs[i]->info, 0);
378
223
            break;
379
223
          }
380
5.78k
        }
381
2.00k
      if (!copy_outputs[i])
382
988
        allocated_outputs[i] = copy_outputs[i] = ccv_nnc_tensor_new(0, outputs[i]->info, 0);
383
2.00k
    }
384
2.15k
  }
385
1.09k
  if (flag == 1)
386
996
  {
387
3.09k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
388
3.09k
    {
389
3.09k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
390
3.09k
      // We have the exec kernel, and support all the tensor memory types.
391
3.09k
      if (api_registry.exec &&
392
3.09k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
393
3.09k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
394
3.09k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
395
996
      {
396
996
        tuned_cmd.backend = backend_init_map[i].backend;
397
996
        // If a given API provides an autotune function, use that to pick the top algorithm.
398
996
        if (api_registry.autotune)
399
131
        {
400
131
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
401
131
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
402
131
          tuned_cmd.algorithm = api_registry.autotune(tuned_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
403
131
          // Drain the context; autotune can use an excessive amount of memory, so drain it now.
404
131
          ccv_nnc_stream_context_drain(stream_context);
405
131
        }
406
996
        break;
407
996
      }
408
3.09k
    }
409
4.52k
    for (i = 0; i < input_size; i++)
410
3.52k
      if (copy_inputs[i])
411
2.58k
        ccv_nnc_tensor_free(copy_inputs[i]);
412
3.03k
    for (i = 0; i < output_size; i++)
413
2.03k
      if (allocated_outputs[i])
414
1.08k
        ccv_nnc_tensor_free(allocated_outputs[i]);
415
996
    ccfree(copy_inputs);
416
996
    return tuned_cmd;
417
996
  }
418
100
  // We need a trial loop through all the data.
419
400
  for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
420
300
  {
421
2.10k
    for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
422
1.80k
    {
423
1.80k
      const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
424
1.80k
      // We have the exec kernel, and support all the tensor memory types.
425
1.80k
      if (api_registry.exec &&
426
1.80k
        (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
427
1.80k
        (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
428
1.80k
        (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
429
600
      {
430
600
        ccv_nnc_cmd_t candid_cmd = cmd;
431
600
        candid_cmd.backend = backend_init_map[i].backend;
432
600
        // If a given API provides an autotune function, use that to pick the top algorithm.
433
600
        if (api_registry.autotune)
434
0
        {
435
0
          // Assuming k == 0 is sufficient, we can skip later trials.
436
0
          if (k > 0)
437
0
            continue;
438
0
          ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
439
0
          _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
440
0
          candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
441
0
          // Drain the context; autotune can use an excessive amount of memory, so drain it now.
442
0
          ccv_nnc_stream_context_drain(stream_context);
443
0
          uint64_t elapsed = ccv_nnc_cmd_mono_time();
444
0
          // Ready to run.
445
0
          int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
446
0
          ccv_nnc_stream_context_wait(stream_context);
447
0
          elapsed = ccv_nnc_cmd_mono_time() - elapsed;
448
0
          if (status == CCV_NNC_EXEC_SUCCESS &&
449
0
            (best_measured == -1 || elapsed < best_measured))
450
0
          {
451
0
            best_measured = elapsed;
452
0
            tuned_cmd = candid_cmd;
453
0
          }
454
600
        } else {
455
600
          // Otherwise loop over the existing algorithms and pick the top one.
456
1.81k
          for (j = 0; j < api_registry.algorithms; j++)
457
1.21k
          {
458
1.21k
            candid_cmd.algorithm = j;
459
1.21k
            ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, inputs, input_size, copy_inputs, input_size, stream_context);
460
1.21k
            _ccv_nnc_cmd_set_device_id(copy_inputs, input_size, copy_outputs, output_size, stream_context);
461
1.21k
            uint64_t elapsed = ccv_nnc_cmd_mono_time();
462
1.21k
            // Ready to run.
463
1.21k
            int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, copy_inputs, input_size, copy_outputs, output_size, stream_context);
464
1.21k
            elapsed = ccv_nnc_cmd_mono_time() - elapsed;
465
1.21k
            if (status == CCV_NNC_EXEC_SUCCESS &&
466
1.21k
              (best_measured == -1 || elapsed < best_measured))
467
331
            {
468
331
              best_measured = elapsed;
469
331
              tuned_cmd = candid_cmd;
470
331
            }
471
1.21k
          }
472
600
        }
473
600
      }
474
1.80k
    }
475
300
  }
476
424
  for (i = 0; i < input_size; i++)
477
324
    if (copy_inputs[i])
478
300
      ccv_nnc_tensor_free(copy_inputs[i]);
479
224
  for (i = 0; i < output_size; i++)
480
124
    if (allocated_outputs[i])
481
124
      ccv_nnc_tensor_free(allocated_outputs[i]);
482
100
  ccfree(copy_inputs);
483
100
  return tuned_cmd;
484
100
}
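
A sketch of the intended calling pattern for the function above: autotune once against representative tensors, then keep dispatching the returned command (which carries the chosen backend and algorithm) through ccv_nnc_cmd_exec(). The workspace budget is an arbitrary placeholder:

#include "ccv_nnc.h"

// Sketch: pick a backend/algorithm once, then reuse the tuned command.
static ccv_nnc_cmd_t tune_then_run(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint,
  ccv_nnc_tensor_t* const* const inputs, const int input_size,
  ccv_nnc_tensor_t* const* const outputs, const int output_size,
  ccv_nnc_stream_context_t* const stream_context)
{
  const size_t max_workspace_size = 512 * 1024 * 1024; // placeholder budget
  const ccv_nnc_cmd_t tuned = ccv_nnc_cmd_autotune(cmd, max_workspace_size, hint, 0, inputs, input_size, outputs, output_size, stream_context);
  ccv_nnc_cmd_exec(tuned, hint, 0, inputs, input_size, outputs, output_size, stream_context);
  return tuned; // keep it around so later dispatches skip the trial runs
}
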
485
486
int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const int input_size, const int output_size, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
487
27.3k
{
488
27.3k
  // If it is no-op, return true, it can deal with any number of parameters.
489
27.3k
  if (cmd.cmd == CCV_NNC_NOOP)
490
4
    return 1;
491
27.2k
  // If it is a custom command, we cannot check it at all, return false.
492
27.2k
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
493
1
    return 0;
494
27.2k
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
495
27.2k
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
496
27.2k
  if (cmd_registry.bitmask)
497
27.2k
    return cmd_registry.bitmask(input_size, output_size, input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
498
0
  // If there is no check, none can pass.
499
0
  return 0;
500
0
}
501
502
int ccv_nnc_device_ids_for_io(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, int* const device_ids, const int max_device_id_size)
503
100k
{
504
100k
  int i, j;
505
100k
  int device_id_size = 0;
506
100k
  if (max_device_id_size <= device_id_size)
507
0
    return device_id_size;
508
100k
  // The device id of the exec is determined by its outputs.
509
242k
  for (i = 0; i < output_size; i++)
510
146k
    if (outputs[i] &&
511
146k
      CCV_TENSOR_GET_MEMORY(outputs[i]->info.type) == CCV_TENSOR_GPU_MEMORY &&
512
146k
      CCV_TENSOR_GET_DEVICE(outputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY)
513
10.1k
    {
514
10.1k
      const int device_id = CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type);
515
10.1k
      int flag = 0;
516
13.8k
      for (j = 0; !flag && j < device_id_size; j++)
517
3.69k
        flag = (device_ids[j] == device_id);
518
10.1k
      if (flag)
519
2.10k
        continue;
520
8.00k
      device_ids[device_id_size++] = device_id;
521
8.00k
      if (device_id_size >= max_device_id_size)
522
4.52k
        return device_id_size;
523
8.00k
    }
524
100k
  if (device_id_size == 0)
525
93.0k
  {
526
93.0k
    int device_id = -1;
527
336k
    for (i = 0; i < input_size; i++)
528
243k
      if (inputs[i] &&
529
243k
        CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == CCV_TENSOR_GPU_MEMORY &&
530
243k
        CCV_TENSOR_GET_DEVICE(inputs[i]->info.type) != CCV_COMPUTE_DEVICE_ANY &&
531
243k
        (device_id < 0 || CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) < device_id))
532
112
        device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type);
533
93.0k
    if (device_id >= 0)
534
112
    {
535
112
      device_ids[0] = device_id;
536
112
      return 1;
537
112
    }
538
95.6k
  }
539
95.6k
  return device_id_size;
540
95.6k
}
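
A sketch of the single-device use of the helper above, the same pattern _ccv_nnc_cmd_set_device_id() applies before dispatch: ask for at most one device id and treat the absence of a pinned GPU tensor as "no preference":

#include "ccv_nnc.h"

// Sketch: pick the GPU device an exec should run on, or -1 when none is pinned.
static int pick_device(ccv_nnc_tensor_t* const* const inputs, const int input_size,
  ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
  int device_id;
  if (ccv_nnc_device_ids_for_io(inputs, input_size, outputs, output_size, &device_id, 1) > 0)
    return device_id;
  return -1; // no GPU tensor with a concrete device id among the inputs / outputs
}
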
541
542
int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
543
3.24M
{
544
3.24M
  // If it is no-op, return as if succeed already.
545
3.24M
  if (cmd.cmd == CCV_NNC_NOOP)
546
12.6k
    return 0;
547
3.23M
  _ccv_nnc_cmd_set_device_id(inputs, input_size, outputs, output_size, stream_context);
548
3.23M
  // If it is a custom command, just apply it directly.
549
3.23M
  if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
550
4
  {
551
4
    int ret = cmd.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
552
4
    if (!stream_context)
553
4
      ccv_nnc_stream_context_drain(stream_context);
554
4
    return ret;
555
4
  }
556
3.23M
  assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
557
3.23M
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
558
3.23M
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
559
3.23M
  int i;
560
3.23M
  uint32_t backend = cmd.backend;
561
3.23M
  if (backend == CCV_NNC_NO_BACKEND)
562
2.14M
  {
563
2.14M
    // Find a suitable backend.
564
2.14M
    int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
565
5.03M
    for (i = 0; i < input_size; i++)
566
2.89M
      if (inputs[i])
567
2.89M
        tensor_memory |= CCV_TENSOR_GET_MEMORY(inputs[i]->info.type), tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
568
4.66M
    for (i = 0; i < output_size; i++)
569
2.52M
      if (outputs[i])
570
2.52M
        tensor_memory |= CCV_TENSOR_GET_MEMORY(outputs[i]->info.type), tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
571
2.14M
    backend = ccv_nnc_cmd_find_backend(cmd, tensor_memory, tensor_formats, tensor_datatypes);
572
2.14M
  }
573
3.23M
  assert(backend != CCV_NNC_NO_BACKEND);
574
3.23M
  const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
575
3.23M
  assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
576
3.23M
  const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
577
3.23M
  if (!api_registry.exec)
578
0
    return CCV_NNC_EXEC_NO_KERNEL;
579
3.23M
  // Everything is out, call the underlying implementation.
580
3.23M
  int ret = api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
581
3.23M
  if (!stream_context)
582
94.3k
    ccv_nnc_stream_context_drain(stream_context);
583
3.23M
  return ret;
584
3.23M
}
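
Putting the pieces together: a hedged end-to-end sketch that builds two CPU tensors, copies one into the other with the data-transfer command already used in this file, and leaves cmd.backend as CCV_NNC_NO_BACKEND so the backend is resolved via ccv_nnc_cmd_find_backend() as shown above. CCV_TENSOR_CPU_MEMORY, CCV_32F and TENSOR_LIST are assumed to come from the wider ccv_nnc.h / ccv_nnc_easy.h headers, not from this file:

#include "ccv_nnc.h"

int main(void)
{
  ccv_nnc_init();
  // Tensor parameters built field by field with the members this file itself reads.
  const ccv_nnc_tensor_param_t params = {
    .type = CCV_TENSOR_CPU_MEMORY,
    .format = CCV_TENSOR_FORMAT_NHWC,
    .datatype = CCV_32F,
    .dim = {2, 2},
  };
  ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, params, 0);
  ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, params, 0);
  // Copy a into b; the backend is picked automatically from the tensors' memory / format / datatype.
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(a), TENSOR_LIST(b), 0);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  return 0;
}
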
585
586
int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
587
0
{
588
0
  // No additional attr for noop.
589
0
  if (cmd.cmd == CCV_NNC_NOOP ||
590
0
    // If it is a custom command, just apply it directly.
591
0
    cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
592
0
    // If it is sub-graph, there is no additional attr as well.
593
0
    cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
594
0
    return 0;
595
0
  const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
596
0
  assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
597
0
  const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
598
0
  return !!(cmd_registry.flags & flags);
599
0
}