Coverage Report

Created: 2017-11-12 13:27

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_cmd.c
Line | Count | Source
  1 |       | #include "ccv_nnc.h"
  2 |       | #include "ccv_nnc_internal.h"
  3 |       | #include "ccv_nnc_easy.h"
  4 |       | #ifdef HAVE_CUDA
  5 |       | #include "gpu/ccv_nnc_compat.h"
  6 |       | #endif
  7 |       | #include <time.h>
  8 |       | #include <sys/time.h>
  9 |       |
 10 |       | #ifdef __MACH__
 11 |       | #include <mach/mach.h>
 12 |       | #include <mach/mach_time.h>
 13 |       | #endif
 14 |       |
 15 |       | typedef struct {
 16 |       |   const uint32_t cmd;
 17 |       |   const char* name;
 18 |       |   ccv_nnc_cmd_registry_t registry;
 19 |       |   ccv_nnc_cmd_backend_registry_t backends[CCV_NNC_BACKEND_COUNT];
 20 |       | } ccv_nnc_cmd_init_t;
 21 |       |
 22 |       | typedef struct {
 23 |       |   const uint32_t backend;
 24 |       |   const char* name;
 25 |       | } ccv_nnc_cmd_backend_init_t;
 26 |       |
 27 |       | // The generated code configures command and its mapping.
 28 |       | #include "cmd/ccv_nnc_cmd.inc"
 29 |       |
 30 |       | void ccv_nnc_init(void)
 31 |     1 | {
 32 |     1 |   _ccv_nnc_cmd_init();
 33 |     1 | }
 34 |       |
 35 |       | const char* ccv_nnc_cmd_name(const uint32_t cmd)
 36 |   339 | {
 37 |   339 |   switch (cmd)
 38 |   339 |   {
 39 |    38 |     case CCV_NNC_NOOP:
 40 |    38 |       return "CCV_NNC_NOOP";
 41 |     0 |     case CCV_NNC_CUSTOM_FORWARD:
 42 |     0 |       return "CCV_NNC_CUSTOM_FORWARD";
 43 |     0 |     case CCV_NNC_CUSTOM_BACKWARD:
 44 |     0 |       return "CCV_NNC_CUSTOM_BACKWARD";
 45 |     0 |     case CCV_NNC_GRAPH_FORWARD:
 46 |     0 |       return "CCV_NNC_GRAPH_FORWARD";
 47 |     0 |     case CCV_NNC_GRAPH_BACKWARD:
 48 |     0 |       return "CCV_NNC_GRAPH_BACKWARD";
 49 |   339 |   }
 50 |   301 |   const int idx = _ccv_nnc_cmd_ph(cmd);
 51 |   301 |   assert(idx >= 0);
 52 |   301 |   assert(idx < sizeof(init_map) / sizeof(init_map[0]));
 53 |   301 |   return init_map[idx].name;
 54 |   339 | }
 55 |       |
 56 |       | const char* ccv_nnc_cmd_backend_name(const uint32_t backend)
 57 |     0 | {
 58 |     0 |   const int idx = _ccv_nnc_cmd_backend_ph(backend);
 59 |     0 |   assert(idx >= 0);
 60 |     0 |   assert(idx < CCV_NNC_BACKEND_COUNT);
 61 |     0 |   return backend_init_map[idx].name;
 62 |     0 | }
 63 |       |
 64 |       | const ccv_nnc_cmd_param_t ccv_nnc_cmd_auto = {};
 65 |       |
 66 |       | int ccv_nnc_is_cmd_auto(const ccv_nnc_cmd_param_t params)
 67 |     0 | {
 68 |     0 |   return (memcmp(&params, &ccv_nnc_cmd_auto, sizeof(ccv_nnc_cmd_param_t)) == 0);
 69 |     0 | }
 70 |       |
 71 |       | int ccv_nnc_cmd_is_forward(const ccv_nnc_cmd_t cmd)
 72 |    50 | {
 73 |    50 |   switch (cmd.cmd)
 74 |    50 |   {
 75 |     2 |     case CCV_NNC_NOOP:
 76 |     2 |       return 0;
 77 |     1 |     case CCV_NNC_CUSTOM_FORWARD:
 78 |     1 |     case CCV_NNC_CUSTOM_BACKWARD:
 79 |     1 |     case CCV_NNC_GRAPH_FORWARD:
 80 |     1 |     case CCV_NNC_GRAPH_BACKWARD:
 81 |    48 |     default:
 82 |    48 |       return !(cmd.cmd & 0x1); // If it is even, it is forward
 83 |    50 |   }
 84 |    50 | }
 85 |       |
 86 |       | int ccv_nnc_cmd_is_backward(const ccv_nnc_cmd_t cmd)
 87 |    94 | {
 88 |    94 |   switch (cmd.cmd)
 89 |    94 |   {
 90 |     2 |     case CCV_NNC_NOOP:
 91 |     2 |       return 0;
 92 |     2 |     case CCV_NNC_CUSTOM_FORWARD:
 93 |     2 |     case CCV_NNC_CUSTOM_BACKWARD:
 94 |     2 |     case CCV_NNC_GRAPH_FORWARD:
 95 |     2 |     case CCV_NNC_GRAPH_BACKWARD:
 96 |    92 |     default:
 97 |    92 |       return !!(cmd.cmd & 0x1); // If it is odd, it is backward
 98 |    94 |   }
 99 |    94 | }
100 |       |
101 |       | int ccv_nnc_cmd_ok(const uint32_t cmd, const uint32_t backend)
102 |     1 | {
103 |     1 |   // If it is a custom command, a no op, or a graph op, there is no backend to check.
104 |     1 |   if (cmd == CCV_NNC_NOOP ||
105 |     1 |     cmd == CCV_NNC_GRAPH_FORWARD || cmd == CCV_NNC_GRAPH_BACKWARD ||
106 |     1 |     cmd == CCV_NNC_CUSTOM_FORWARD || cmd == CCV_NNC_CUSTOM_BACKWARD)
107 |     0 |     return 1;
108 |     1 |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd);
109 |     1 |   const int backend_idx = _ccv_nnc_cmd_backend_ph(backend);
110 |     1 |   assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
111 |     1 |   assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
112 |     1 |   const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
113 |     1 |   // Check if the execution function exists or not.
114 |     1 |   return !!api_registry.exec;
115 |     1 | }
116 |       |
117 |       | ccv_nnc_cmd_t ccv_nnc_cmd(const uint32_t _cmd, ccv_nnc_cmd_exec_f exec, const ccv_nnc_cmd_param_t params, const int flags)
118 |   306 | {
119 |   306 |   ccv_nnc_cmd_t cmd;
120 |   306 |   cmd.info = params;
121 |   306 |   // Default to CPU ref implementation if the type is CPU memory, otherwise use GPU ref.
122 |   306 |   cmd.backend = CCV_NNC_BACKEND_CPU_REF;
123 |   306 |   assert((_cmd == CCV_NNC_CUSTOM_FORWARD && exec) || (_cmd != CCV_NNC_CUSTOM_FORWARD && !exec));
124 |   306 |   cmd.cmd = _cmd;
125 |   306 |   cmd.algorithm = -1; // This is default.
126 |   306 |   cmd.exec = exec;
127 |   306 |   return cmd;
128 |   306 | }
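
Note: ccv_nnc_cmd() above only packages parameters and defaults (CPU reference backend, algorithm -1); it does not dispatch anything. A minimal usage sketch, assuming a generated, non-custom command identifier such as CCV_NNC_RELU_FORWARD (not defined in this file) is available from the command tables:

    #include "ccv_nnc.h"

    static ccv_nnc_cmd_t make_relu(void)
    {
      // Non-custom commands pass a 0 exec pointer; ccv_nnc_cmd_auto is the
      // zero-initialized parameter block declared at line 64 of the listing.
      return ccv_nnc_cmd(CCV_NNC_RELU_FORWARD, 0, ccv_nnc_cmd_auto, 0);
    }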
129 |       |
130 |       | const ccv_nnc_hint_t ccv_nnc_no_hint = {};
131 |       |
132 |       | int ccv_nnc_is_no_hint(const ccv_nnc_hint_t hint)
133 |   213 | {
134 |   213 |   return (memcmp(&hint, &ccv_nnc_no_hint, sizeof(ccv_nnc_hint_t)) == 0);
135 |   213 | }
136 |       |
137 |       | int ccv_nnc_hint_verify(const ccv_nnc_hint_t hint, const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
138 |     1 | {
139 |     1 |   int i;
140 |     1 |   assert(a.format == b.format);
141 |     1 |   const int nd = ccv_nnc_tensor_nd(a.dim);
142 |     1 |   assert(nd == CCV_NNC_MAX_DIM + 1 || nd == CCV_NNC_MAX_DIM + 2);
143 |     1 |   int hw;
144 |     1 |   if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
145 |     1 |     (a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 1))
146 |     0 |     hw = 0;
147 |     1 |   else if ((a.format == CCV_TENSOR_FORMAT_NHWC && nd == CCV_NNC_MAX_DIM + 2) ||
148 |     0 |     (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 1))
149 |     1 |     hw = 1;
150 |     0 |   else if (a.format == CCV_TENSOR_FORMAT_NCHW && nd == CCV_NNC_MAX_DIM + 2)
151 |     0 |     hw = 2;
152 |     0 |   else
153 |     0 |     assert(0 && "unknown format");
154 |     3 |   for (i = 0; i < CCV_NNC_MAX_DIM; i++)
155 |     2 |   {
156 |     2 |     if ((hint.border.begin[i] + hint.border.end[i] + a.dim[i] - cmd.size.dim[i]) % hint.stride.dim[i] != 0)
157 |     0 |       return -1;
158 |     2 |     int expected = (hint.border.begin[i] + hint.border.end[i] + a.dim[i + hw] - cmd.size.dim[i]) / hint.stride.dim[i] + 1;
159 |     2 |     if (expected != b.dim[i + hw])
160 |     0 |       return -1;
161 |     2 |   }
162 |     1 |   return 0;
163 |     1 | }
164 |       |
165 |       | ccv_nnc_hint_t ccv_nnc_hint_auto(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t a, const ccv_nnc_tensor_param_t b)
166 |   221 | {
167 |   221 |   int i;
168 |   221 |   assert(a.format == b.format);
169 |   221 |   const int a_nd = ccv_nnc_tensor_nd(a.dim);
170 |   221 |   const int b_nd = ccv_nnc_tensor_nd(b.dim);
171 |   221 |   // Is not auto hint deducible dimensions.
172 |   221 |   if (a_nd != b_nd || (a_nd != CCV_NNC_MAX_DIM + 1 && a_nd != CCV_NNC_MAX_DIM + 2))
173 |   118 |     return ccv_nnc_no_hint;
174 |   103 |   int hw;
175 |   103 |   if ((a.format == CCV_TENSOR_FORMAT_CHWN) ||
176 |   103 |     (a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 1))
177 |   102 |     hw = 0;
178 |     1 |   else if ((a.format == CCV_TENSOR_FORMAT_NHWC && a_nd == CCV_NNC_MAX_DIM + 2) ||
179 |     0 |     (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 1))
180 |     1 |     hw = 1;
181 |     0 |   else if (a.format == CCV_TENSOR_FORMAT_NCHW && a_nd == CCV_NNC_MAX_DIM + 2)
182 |     0 |     hw = 2;
183 |     0 |   else
184 |     0 |     assert(0 && "unknown format");
185 |   103 |   ccv_nnc_hint_t hint_auto = {};
186 |   103 |   // 0-dim is reserved for channels
187 |   309 |   for (i = 0; i < CCV_NNC_MAX_DIM; i++)
188 |   206 |   {
189 |   206 |     // Cannot have one of the dim is zero, we cannot auto the hint, return no hint.
190 |   206 |     assert(a.dim[i + hw] && b.dim[i + hw]);
191 |   206 |     // This is guessed by having a stride that will approximately match the scale.
192 |   206 |     int stride = (a.dim[i + hw] + b.dim[i + hw] / 2) / b.dim[i + hw];
193 |   206 |     hint_auto.stride.dim[i] = stride;
194 |   206 |     int border = (b.dim[i + hw] - 1) * stride - a.dim[i + hw] + cmd.size.dim[i];
195 |   206 |     hint_auto.border.begin[i] = (border + 1) / 2; // Always prefer to have more padding in the beginning, this matches CUDNN behavior.
196 |   206 |     hint_auto.border.end[i] = border - hint_auto.border.begin[i];
197 |   206 |   }
198 |   103 |   return hint_auto;
199 |   221 | }
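
Note: the stride/border guess at lines 192-196 can be checked by hand. A self-contained sketch with made-up dimensions (a 32-wide input, a 16-wide output, a 5-wide filter) works out to stride 2 and border 3, split as begin 2 / end 1:

    #include <assert.h>

    int main(void)
    {
      // Hypothetical single-dimension walk-through of the guess in ccv_nnc_hint_auto.
      const int a_dim = 32, b_dim = 16, filter = 5;
      const int stride = (a_dim + b_dim / 2) / b_dim;            // (32 + 8) / 16 = 2
      const int border = (b_dim - 1) * stride - a_dim + filter;  // 15 * 2 - 32 + 5 = 3
      const int begin = (border + 1) / 2;                        // 2, more padding up front (CUDNN-style)
      const int end = border - begin;                            // 1
      assert(stride == 2 && begin == 2 && end == 1);
      return 0;
    }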
200 |       |
201 |       | void ccv_nnc_hint_tensor_auto_forward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
202 |   105 | {
203 |   105 |   int i;
204 |   105 |   assert(output_size <= input_size);
205 |   210 |   for (i = 0; i < output_size; i++)
206 |   105 |     outputs[i] = inputs[i];
207 |   105 | }
208 |       |
209 |       | void ccv_nnc_hint_tensor_auto_backward_from_gradient(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
210 |    22 | {
211 |    22 |   int i;
212 |    63 |   for (i = 0; i < output_size; i++)
213 |    41 |     outputs[i] = inputs[0];
214 |    22 | }
215 |       |
216 |       | void ccv_nnc_hint_tensor_auto_backward_from_inputs(const ccv_nnc_cmd_param_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
217 |     0 | {
218 |     0 |   int i;
219 |     0 |   assert(output_size < input_size);
220 |     0 |   for (i = 0; i < output_size; i++)
221 |     0 |     outputs[i] = inputs[i + 1];
222 |     0 | }
223 |       |
224 |       | void ccv_nnc_hint_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size)
225 |   187 | {
226 |   187 |   // zero out the parameters
227 |   187 |   const ccv_nnc_tensor_param_t z = {};
228 |   187 |   int i;
229 |   393 |   for (i = 0; i < output_size; i++)
230 |   206 |     outputs[i] = z; // Reset the outputs.
231 |   187 |   // Cannot handle these situations.
232 |   187 |   if (cmd.cmd == CCV_NNC_NOOP || cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
233 |    10 |     return;
234 |   177 |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
235 |   177 |   const ccv_nnc_cmd_registry_t registry = init_map[cmd_idx].registry;
236 |   177 |   if (registry.tensor_auto)
237 |   177 |     registry.tensor_auto(cmd.info, inputs, input_size, hint, outputs, output_size);
238 |     0 |   else if (ccv_nnc_cmd_is_forward(cmd)) // For forward, the default auto is forward_from_inputs
239 |     0 |     ccv_nnc_hint_tensor_auto_forward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
240 |     0 |   else // For backward, the default auto is backward_from_inputs
241 |     0 |     ccv_nnc_hint_tensor_auto_backward_from_inputs(cmd.info, inputs, input_size, hint, outputs, output_size);
242 |   177 | }
243 |       |
244 |       | // This returns absolute time.
245 |       | uint64_t ccv_nnc_cmd_mono_time(void)
246 | 1.01k | {
247 | 1.01k | #ifdef __MACH__
248 |       |   return mach_absolute_time();
249 |       | #else
250 | 1.01k |   struct timespec ts;
251 | 1.01k |   clock_gettime(CLOCK_MONOTONIC, &ts);
252 | 1.01k |   return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
253 | 1.01k | #endif
254 | 1.01k | }
255 |       |
256 |   212 | #define AUTO_TUNE_TRIAL_SIZE (3)
257 |       |
258 |       | ccv_nnc_cmd_t ccv_nnc_cmd_autotune(const ccv_nnc_cmd_t cmd, const size_t max_workspace_size, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
259 |    53 | {
260 |    53 |   // This is a custom cmd kernel, no need to autotune.
261 |    53 |   if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD || cmd.cmd == CCV_NNC_NOOP)
262 |     0 |     return cmd;
263 |    53 |   int i, j, k;
264 |    53 |   // Go through all the backends that supports the same type of memory input / output tensors support.
265 |    53 |   int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
266 |   172 |   for (i = 0; i < input_size; i++)
267 |   119 |     if (inputs[i])
268 |   119 |       tensor_memory |= inputs[i]->info.type, tensor_formats |= inputs[i]->info.format, tensor_datatypes |= inputs[i]->info.datatype;
269 |   106 |   for (i = 0; i < output_size; i++)
270 |    53 |     if (outputs[i])
271 |    53 |       tensor_memory |= outputs[i]->info.type, tensor_formats |= outputs[i]->info.format, tensor_datatypes |= outputs[i]->info.datatype;
272 |    53 |   // In this case, we cannot determine the type of the tensor, skip auto-tune.
273 |    53 |   if (!tensor_memory)
274 |     0 |     return cmd;
275 |    53 |   // Otherwise, we are good to go.
276 |    53 |   ccv_nnc_cmd_t tuned_cmd = cmd;
277 |    53 |   int64_t best_measured = -1;
278 |    53 |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
279 |    53 |   // We need to have trial loop through all the data.
280 |   212 |   for (k = 0; k < AUTO_TUNE_TRIAL_SIZE; k++)
281 |   159 |   {
282 |   795 |     for (i = 0; i < CCV_NNC_BACKEND_COUNT; i++)
283 |   636 |     {
284 |   636 |       const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[i];
285 |   636 |       // We have the exec kernel, and support all the tensor memory types.
286 |   636 |       if (api_registry.exec &&
287 |   339 |         (api_registry.tensor_memory & tensor_memory) == tensor_memory &&
288 |   255 |         (api_registry.tensor_formats & tensor_formats) == tensor_formats &&
289 |   255 |         (api_registry.tensor_datatypes & tensor_datatypes) == tensor_datatypes)
290 |   255 |       {
291 |   255 |         ccv_nnc_cmd_t candid_cmd = cmd;
292 |   255 |         candid_cmd.backend = backend_init_map[i].backend;
293 |   255 |         // If a given API exist an autotune function, use that to pick the top algorithm.
294 |   255 |         if (api_registry.autotune)
295 |     3 |         {
296 |     3 |           // Assuming k == 0 is sufficient, and we can skip.
297 |     3 |           if (k > 0)
298 |     2 |             continue;
299 |     1 |           candid_cmd.algorithm = api_registry.autotune(candid_cmd, max_workspace_size, hint, flags, inputs, input_size, outputs, output_size, stream_context);
300 |     1 |           uint64_t elapsed = ccv_nnc_cmd_mono_time();
301 |     1 |           // Ready to run.
302 |     1 |           int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
303 |     1 |           ccv_nnc_stream_context_wait(stream_context);
304 |     1 |           elapsed = ccv_nnc_cmd_mono_time() - elapsed;
305 |     1 |           if (status == CCV_NNC_EXEC_SUCCESS &&
306 |     1 |             (best_measured == -1 || elapsed < best_measured))
307 |     1 |           {
308 |     1 |             best_measured = elapsed;
309 |     1 |             tuned_cmd = candid_cmd;
310 |     1 |           }
311 |   252 |         } else {
312 |   252 |           // Otherwise loop over the existing algorithms and pick the top one.
313 |   756 |           for (j = 0; j < api_registry.algorithms; j++)
314 |   504 |           {
315 |   504 |             candid_cmd.algorithm = j;
316 |   504 |             uint64_t elapsed = ccv_nnc_cmd_mono_time();
317 |   504 |             // Ready to run.
318 |   504 |             int status = ccv_nnc_cmd_exec(candid_cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
319 |   504 |             elapsed = ccv_nnc_cmd_mono_time() - elapsed;
320 |   504 |             if (status == CCV_NNC_EXEC_SUCCESS &&
321 |   348 |               (best_measured == -1 || elapsed < best_measured))
322 |   142 |             {
323 |   142 |               best_measured = elapsed;
324 |   142 |               tuned_cmd = candid_cmd;
325 |   142 |             }
326 |   504 |           }
327 |   252 |         }
328 |   255 |       }
329 |   636 |     }
330 |   159 |   }
331 |    53 |   return tuned_cmd;
332 |    53 | }
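
Note: a minimal sketch of how the autotuner above is meant to be driven, assuming the caller already owns allocated ccv_nnc_tensor_t inputs/outputs and runs without a stream context; the 0 workspace bound and null stream are illustration-only choices:

    static ccv_nnc_cmd_t run_tuned(const ccv_nnc_cmd_t cmd, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
    {
      // Let the trial loop pick the best backend / algorithm pair, then execute it.
      const ccv_nnc_cmd_t tuned = ccv_nnc_cmd_autotune(cmd, 0, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0);
      ccv_nnc_cmd_exec(tuned, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0);
      return tuned;
    }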
333 |       |
334 |       | int ccv_nnc_cmd_bitmask(const ccv_nnc_cmd_t cmd, const uint64_t* const input_bitmasks, const int input_bitmask_size, const uint64_t* const output_bitmasks, const int output_bitmask_size)
335 |   229 | {
336 |   229 |   // If it is no-op, return true, it can deal with any number of parameters.
337 |   229 |   if (cmd.cmd == CCV_NNC_NOOP)
338 |     4 |     return 1;
339 |   229 |   // If it is a custom command, I cannot check it at all, return true.
340 |   225 |   if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
341 |     0 |     return 1;
342 |   225 |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
343 |   225 |   const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
344 |   225 |   if (cmd_registry.bitmask)
345 |   225 |     return cmd_registry.bitmask(input_bitmasks, input_bitmask_size, output_bitmasks, output_bitmask_size);
346 |   225 |   // If there is not checking, none can pass.
347 |     0 |   return 0;
348 |   225 | }
349 |       |
350 |       | int ccv_nnc_cmd_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const ccv_nnc_stream_context_t* const stream_context)
351 | 1.94k | {
352 | 1.94k |   // If it is no-op, return as if succeed already.
353 | 1.94k |   if (cmd.cmd == CCV_NNC_NOOP)
354 |   113 |     return 0;
355 | 1.94k |   // If it is a custom command, just apply it directly.
356 | 1.83k |   if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
357 |     1 |     return cmd.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
358 | 1.83k |   assert(cmd.cmd != CCV_NNC_GRAPH_FORWARD && cmd.cmd != CCV_NNC_GRAPH_BACKWARD);
359 | 1.83k |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
360 | 1.83k |   const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend);
361 | 1.83k |   assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
362 | 1.83k |   assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
363 | 1.83k |   const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
364 | 1.83k |   const ccv_nnc_cmd_backend_registry_t api_registry = init_map[cmd_idx].backends[backend_idx];
365 | 1.83k |   if (!api_registry.exec)
366 |     0 |     return CCV_NNC_EXEC_NO_KERNEL;
367 | 1.83k |   int i;
368 | 1.83k |   uint64_t stack_input_bitmasks[CCV_NNC_STACK_BITMASK_ALLOC] = {};
369 | 1.83k |   uint64_t stack_output_bitmasks[CCV_NNC_STACK_BITMASK_ALLOC] = {};
370 | 1.83k |   assert(CCV_NNC_STACK_BITMASK_ALLOC > 0);
371 | 1.83k |   uint64_t* input_bitmasks = (input_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC) ? (uint64_t*)cccalloc((input_size + 63) / 64, sizeof(uint64_t)) : stack_input_bitmasks;
372 | 1.83k |   uint64_t* output_bitmasks = (output_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC) ? (uint64_t*)cccalloc((input_size + 63) / 64, sizeof(uint64_t)) : stack_output_bitmasks;
373 | 6.00k |   for (i = 0; i < input_size; i++)
374 | 4.17k |     if (inputs[i])
375 | 4.13k |     {
376 | 4.13k |       assert(api_registry.tensor_formats & inputs[i]->info.format);
377 | 4.13k |       assert(api_registry.tensor_datatypes & inputs[i]->info.datatype);
378 | 4.13k |       input_bitmasks[i >> 6] |= (uint64_t)1 << (i & 63);
379 | 4.13k |     }
380 | 3.70k |   for (i = 0; i < output_size; i++)
381 | 1.87k |     if (outputs[i])
382 | 1.86k |     {
383 | 1.86k |       assert(api_registry.tensor_formats & outputs[i]->info.format);
384 | 1.86k |       assert(api_registry.tensor_datatypes & outputs[i]->info.datatype);
385 | 1.86k |       output_bitmasks[i >> 6] |= (uint64_t)1 << (i & 63);
386 | 1.86k |     }
387 | 1.83k |   if (cmd_registry.bitmask)
388 | 1.83k |     // If cannot pass the bitmask check.
389 | 1.83k |     if (!cmd_registry.bitmask(input_bitmasks, (input_size + 63) / 64, output_bitmasks, (output_size + 63) / 64))
390 |     0 |     {
391 |     0 |       if (input_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC)
392 |     0 |         ccfree(input_bitmasks);
393 |     0 |       if (output_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC)
394 |     0 |         ccfree(output_bitmasks);
395 |     0 |       return CCV_NNC_EXEC_INVALID; // Return invalid input.
396 |     0 |     }
397 | 1.83k |   // TODO: Print out warning message.
398 | 1.83k |   if (input_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC)
399 |     0 |     ccfree(input_bitmasks);
400 | 1.83k |   if (output_size > 64 * CCV_NNC_STACK_BITMASK_ALLOC)
401 |     0 |     ccfree(output_bitmasks);
402 | 1.83k |   // Everything is out, call the underlying implementation.
403 | 1.83k |   return api_registry.exec(cmd, hint, flags, inputs, input_size, outputs, output_size, stream_context);
404 | 1.83k | }
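
Note: the loops at lines 373-386 record which inputs/outputs are non-NULL as one bit per tensor, 64 tensors per 64-bit word. The indexing is plain word = i / 64, bit = i % 64; a standalone sketch with hypothetical tensor slots:

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
      // Same word/bit indexing as input_bitmasks[i >> 6] |= (uint64_t)1 << (i & 63) above.
      uint64_t bitmask[2] = {};
      const int present[] = { 0, 3, 70 }; // hypothetical non-NULL tensor slots
      int n;
      for (n = 0; n < 3; n++)
      {
        const int i = present[n];
        bitmask[i >> 6] |= (uint64_t)1 << (i & 63);
      }
      assert(bitmask[0] == (((uint64_t)1 << 0) | ((uint64_t)1 << 3)));
      assert(bitmask[1] == ((uint64_t)1 << 6)); // slot 70 lands in word 1, bit 70 - 64 = 6
      return 0;
    }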
405 |       |
406 |       | int ccv_nnc_cmd_attr(const ccv_nnc_cmd_t cmd, const int flags)
407 |   162 | {
408 |   162 |   // No additional attr for noop.
409 |   162 |   if (cmd.cmd == CCV_NNC_NOOP ||
410 |   162 |     // If it is a custom command, just apply it directly.
411 |   140 |     cmd.cmd == CCV_NNC_CUSTOM_FORWARD || cmd.cmd == CCV_NNC_CUSTOM_BACKWARD ||
412 |   162 |     // If it is sub-graph, there is no additional attr as well.
413 |   140 |     cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
414 |    33 |     return 0;
415 |   129 |   const int cmd_idx = _ccv_nnc_cmd_ph(cmd.cmd);
416 |   129 |   const int backend_idx = _ccv_nnc_cmd_backend_ph(cmd.backend);
417 |   129 |   assert(cmd_idx >= 0 && cmd_idx < sizeof(init_map) / sizeof(init_map[0]));
418 |   129 |   assert(backend_idx >= 0 && backend_idx < CCV_NNC_BACKEND_COUNT);
419 |   129 |   const ccv_nnc_cmd_registry_t cmd_registry = init_map[cmd_idx].registry;
420 |   129 |   return !!(cmd_registry.flags & flags);
421 |   162 | }
422 |       |
423 |       | struct ccv_nnc_stream_context_s {
424 |       |   int type;
425 |       |   // Left for implementation yet, the CPU support for stream context.
426 |       | };
427 |       |
428 |       | ccv_nnc_stream_context_t* ccv_nnc_stream_context_new(const int type)
429 |     1 | {
430 |     1 |   ccv_nnc_stream_context_t* stream_context = (ccv_nnc_stream_context_t*)ccmalloc(sizeof(ccv_nnc_stream_context_t));
431 |     1 |   stream_context->type = type;
432 |     1 | #ifdef HAVE_CUDA
433 |     1 |   if (CCV_STREAM_GET_CONTEXT(type) == CCV_STREAM_CONTEXT_GPU)
434 |     1 |     stream_context = ccv_nnc_init_stream_context(stream_context);
435 |     1 | #endif
436 |     1 |   return stream_context;
437 |     1 | }
438 |       |
439 |       | void ccv_nnc_stream_context_wait(const ccv_nnc_stream_context_t* const stream_context)
440 |     3 | {
441 |     3 |   if (!stream_context)
442 |     0 |     return;
443 |     3 | #ifdef HAVE_CUDA
444 |     3 |   if (CCV_STREAM_GET_CONTEXT(stream_context->type) == CCV_STREAM_CONTEXT_GPU)
445 |     3 |     ccv_nnc_synchronize_stream_context(stream_context);
446 |     3 | #endif
447 |     3 | }
448 |       |
449 |       | void ccv_nnc_stream_context_free(ccv_nnc_stream_context_t* const stream_context)
450 |     1 | {
451 |     1 | #ifdef HAVE_CUDA
452 |     1 |   if (CCV_STREAM_GET_CONTEXT(stream_context->type) == CCV_STREAM_CONTEXT_GPU)
453 |     1 |     ccv_nnc_deinit_stream_context(stream_context);
454 |     1 | #endif
455 |     1 |   ccfree(stream_context);
456 |     1 | }
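
Note: the stream context API above is a plain create/wait/free lifecycle. A minimal sketch, assuming the CCV_STREAM_CONTEXT_GPU constant referenced by the listing (but defined elsewhere) can be passed to ccv_nnc_stream_context_new() as-is:

    static void stream_lifecycle(void)
    {
      ccv_nnc_stream_context_t* const stream = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_GPU);
      // ... enqueue work against the stream, e.g. ccv_nnc_cmd_exec(..., stream) ...
      ccv_nnc_stream_context_wait(stream); // block until the queued work completes
      ccv_nnc_stream_context_free(stream);
    }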