Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_graph.c

 Line |  Count | Source
    1 |        | #include "ccv_nnc.h"
    2 |        | #include "ccv_nnc_easy.h"
    3 |        | #include "ccv_nnc_internal.h"
    4 |        | #include "ccv_internal.h"
    5 |        | #include "_ccv_nnc_graph.h"
    6 |        |
    7 |        | // MARK - Level-2 API
    8 |        |
    9 |        | ccv_nnc_graph_t* ccv_nnc_graph_new(void)
   10 |  6.24k | {
   11 |  6.24k |   ccv_nnc_graph_t* graph = (ccv_nnc_graph_t*)cccalloc(1, sizeof(ccv_nnc_graph_t));
   12 |  6.24k |   graph->exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), 5, 0);
   13 |  6.24k |   return graph;
   14 |  6.24k | }
   15 |        |
   16 |        | void ccv_nnc_graph_set_sources(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const sources, const int source_size)
   17 |  6.23k | {
   18 |  6.23k |   if (!graph->sources)
   19 |  6.23k |     graph->sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), source_size, 0);
   20 |      0 |   else
   21 |      0 |     ccv_array_clear(graph->sources);
   22 |  6.23k |   int i;
   23 |  12.4k |   for (i = 0; i < source_size; i++)
   24 |  6.23k |     ccv_array_push(graph->sources, sources + i);
   25 |  6.23k |   graph->topsorted = 0;
   26 |  6.23k | }
   27 |        |
   28 |        | ccv_nnc_graph_exec_t* ccv_nnc_graph_sources(const ccv_nnc_graph_t* const graph)
   29 |      0 | {
   30 |      0 |   return graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0;
   31 |      0 | }
   32 |        |
   33 |        | int ccv_nnc_graph_source_size(const ccv_nnc_graph_t* const graph)
   34 |      0 | {
   35 |      0 |   return graph->sources ? graph->sources->rnum : 0;
   36 |      0 | }
   37 |        |
   38 |        | void ccv_nnc_graph_set_destinations(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
   39 |  6.23k | {
   40 |  6.23k |   if (!graph->destinations)
   41 |  6.23k |     graph->destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), destination_size, 0);
   42 |      0 |   else
   43 |      0 |     ccv_array_clear(graph->destinations);
   44 |  6.23k |   int i;
   45 |  12.4k |   for (i = 0; i < destination_size; i++)
   46 |  6.23k |     ccv_array_push(graph->destinations, destinations + i);
   47 |  6.23k |   graph->topsorted = 0;
   48 |  6.23k | }
   49 |        |
   50 |        | ccv_nnc_graph_exec_t* ccv_nnc_graph_destinations(const ccv_nnc_graph_t* const graph)
   51 |      0 | {
   52 |      0 |   return graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0;
   53 |      0 | }
   54 |        |
   55 |        | int ccv_nnc_graph_destination_size(const ccv_nnc_graph_t* const graph)
   56 |      0 | {
   57 |      0 |   return graph->destinations ? graph->destinations->rnum : 0;
   58 |      0 | }
   59 |        |
   60 |        | void ccv_nnc_graph_exec_set(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_cmd_t cmd)
   61 |  44.2k | {
   62 |  44.2k |   assert(exec.d < graph->exec_info->rnum);
   63 |  44.2k |   assert(exec.graph == graph);
   64 |  44.2k |   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   65 |  44.2k |   exec_info->cmd = cmd;
   66 |  44.2k | }
   67 |        |
   68 |        | ccv_nnc_cmd_t ccv_nnc_graph_exec_cmd(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec)
   69 |  6.41k | {
   70 |  6.41k |   assert(exec.d < graph->exec_info->rnum);
   71 |  6.41k |   assert(exec.graph == graph);
   72 |  6.41k |   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   73 |  6.41k |   return exec_info->cmd;
   74 |  6.41k | }
   75 |        |
   76 |        | void ccv_nnc_graph_exec_set_hint(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_hint_t hint)
   77 |    178 | {
   78 |    178 |   assert(exec.d < graph->exec_info->rnum);
   79 |    178 |   assert(exec.graph == graph);
   80 |    178 |   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   81 |    178 |   exec_info->hint = hint;
   82 |    178 | }
   83 |        |
   84 |        | static int _ccv_nnc_tensor_multiview_level_count(const ccv_nnc_tensor_multiview_t* const mv)
   85 |    482 | {
   86 |    482 |   if (!CCV_IS_TENSOR_MULTIVIEW(mv))
   87 |    327 |     return 1;
   88 |    155 |   const int count = mv->kind + mv->repeat;
   89 |    155 |   int i, c = 0;
   90 |    502 |   for (i = 0; i < count; i++)
   91 |    347 |   {
   92 |    347 |     ccv_nnc_tensor_t* tv = CCV_NNC_MULTIVIEW_DATA(mv)[i];
   93 |    347 |     if (tv == CCV_NNC_TENSOR_PLACEHOLDER)
   94 |      8 |       c = ccv_max(c, 1);
   95 |    339 |     else
   96 |    339 |       c = ccv_max(c, _ccv_nnc_tensor_multiview_level_count((ccv_nnc_tensor_multiview_t*)tv));
   97 |    347 |   }
   98 |    155 |   return c + 1;
   99 |    482 | }
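
The recursion above is easier to see on a toy structure. A minimal, runnable stand-in (plain structs, not the real ccv_nnc_tensor_multiview_t) that mirrors the same depth computation:

#include <stdio.h>

// Stand-in sketch, not the real ccv types: a leaf counts as 1 level and
// each enclosing multiview adds 1, exactly like the recursion in
// _ccv_nnc_tensor_multiview_level_count.
typedef struct node { int nchild; struct node** child; } node;

static int level_count(const node* const n)
{
  if (n->nchild == 0)
    return 1;
  int i, c = 0;
  for (i = 0; i < n->nchild; i++)
  {
    const int v = level_count(n->child[i]);
    c = c > v ? c : v;
  }
  return c + 1;
}

int main(void)
{
  node leaf = { 0, 0 };
  node* pair[2] = { &leaf, &leaf };
  node mv = { 2, pair };   // multiview over two plain tensors
  node* one[1] = { &mv };
  node outer = { 1, one }; // multiview over a multiview
  printf("%d %d %d\n", level_count(&leaf), level_count(&mv), level_count(&outer)); // 1 2 3
  return 0;
}
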
  100 |        |
  101 |        | static ccv_nnc_graph_tensor_wrap_t* _ccv_nnc_graph_tensor_wrap_new(const ccv_nnc_tensor_multiview_t* const mv)
  102 |    143 | {
  103 |    143 |   const int level_count = _ccv_nnc_tensor_multiview_level_count(mv);
  104 |    143 |   ccv_nnc_graph_tensor_wrap_t* tensor_wrap = (ccv_nnc_graph_tensor_wrap_t*)ccmalloc(sizeof(ccv_nnc_graph_tensor_wrap_t) + sizeof(ccv_nnc_tensor_t*) * (level_count - 1));
  105 |    143 |   tensor_wrap->update_required = 0;
  106 |    143 |   tensor_wrap->count = level_count;
  107 |    143 |   tensor_wrap->index = 0;
  108 |    143 |   tensor_wrap->tensors[0] = (ccv_nnc_tensor_t*)mv;
  109 |    143 |   return tensor_wrap;
  110 |    143 | }
  111 |        |
  112 |        | static void _ccv_nnc_graph_exec_rewind(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
  113 |     23 | {
  114 |     23 |   if (!info->tensor_wraps_ref)
  115 |     22 |     return;
  116 |      1 |   int i;
  117 |      1 |   assert(info->tensor_wraps_ref <= graph->tensor_wraps->rnum);
  118 |      1 |   ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
  119 |        |   // Rewind from tensor wraps.
  120 |      3 |   for (i = 0; i < info->input_size; i++)
  121 |      2 |     if (tensor_wrap_array->tensor_wraps[i])
  122 |      1 |       info->inputs[i] = tensor_wrap_array->tensor_wraps[i]->tensors[0];
  123 |      1 |   const int d = info->input_size;
  124 |      2 |   for (i = 0; i < info->output_size; i++)
  125 |      1 |     if (tensor_wrap_array->tensor_wraps[d + i])
  126 |      1 |       info->outputs[i] = tensor_wrap_array->tensor_wraps[d + i]->tensors[0];
  127 |      1 |   const int dd = info->input_size + info->output_size;
  128 |      1 |   for (i = 0; i < info->update_size; i++)
  129 |      0 |     if (tensor_wrap_array->tensor_wraps[dd + i])
  130 |      0 |       info->updates[i] = tensor_wrap_array->tensor_wraps[dd + i]->tensors[0];
  131 |      1 | }
  132 |        |
  133 |        | static void _ccv_nnc_graph_tensor_wrap_free(ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
  134 |    195 | {
  135 |    195 |   ccfree(tensor_wrap);
  136 |    195 | }
  137 |        |
  138 |        | ccv_nnc_graph_tensor_wrap_array_t* ccv_nnc_get_tensor_wrap_array(ccv_nnc_graph_t* const graph, const int tensor_wrap_size, int* const tensor_wraps_ref)
  139 |     62 | {
  140 |     62 |   ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = *tensor_wraps_ref ? (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, *tensor_wraps_ref - 1) : 0;
  141 |        |   // Otherwise, find an open slot.
  142 |     62 |   if (!tensor_wrap_array_ref)
  143 |     53 |   {
  144 |     53 |     if (!graph->tensor_wraps)
  145 |     27 |       graph->tensor_wraps = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wrap_array_t*), 0, 0);
  146 |     53 |     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = 0;
  147 |     53 |     ccv_array_push(graph->tensor_wraps, &tensor_wrap_array);
  148 |     53 |     tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, graph->tensor_wraps->rnum - 1);
  149 |     53 |     *tensor_wraps_ref = graph->tensor_wraps->rnum;
  150 |     53 |   }
  151 |     62 |   int i;
  152 |     62 |   if (*tensor_wrap_array_ref)
  153 |      9 |   {
  154 |      9 |     if ((*tensor_wrap_array_ref)->size != tensor_wrap_size)
  155 |      9 |       *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)ccrealloc(*tensor_wrap_array_ref, sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1));
  156 |     18 |     for (i = (*tensor_wrap_array_ref)->size; i < tensor_wrap_size; i++)
  157 |      9 |       (*tensor_wrap_array_ref)->tensor_wraps[i] = 0;
  158 |      9 |   } else
  159 |     53 |     *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)cccalloc(sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1), 1);
  160 |     62 |   ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
  161 |     62 |   tensor_wrap_array->size = tensor_wrap_size;
  162 |     62 |   return tensor_wrap_array;
  163 |     62 | }
  164 |        |
  165 |        | void ccv_nnc_set_tensor_wraps(ccv_nnc_graph_tensor_wrap_t** const tensor_wraps, ccv_nnc_tensor_t* const* const tensors, const int tensor_size)
  166 |    184 | {
  167 |    184 |   int i;
  168 |    349 |   for (i = 0; i < tensor_size; i++)
  169 |    165 |     if (tensors[i])
  170 |    164 |     {
  171 |    164 |       if (CCV_IS_TENSOR_MULTIVIEW(tensors[i]) &&
  172 |    164 |         ((ccv_nnc_tensor_multiview_t*)tensors[i])->anchor != CCV_NNC_MULTIVIEW_PHI)
  173 |    107 |       {
  174 |    107 |         if (!tensor_wraps[i] || tensors[i] != tensor_wraps[i]->tensors[0])
  175 |     93 |         {
  176 |     93 |           if (tensor_wraps[i])
  177 |      0 |             _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
  178 |     93 |           tensor_wraps[i] = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)tensors[i]);
  179 |     93 |         }
  180 |    107 |       } else {
  181 |     57 |         if (tensor_wraps[i])
  182 |      0 |           _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
  183 |     57 |         tensor_wraps[i] = 0;
  184 |     57 |       }
  185 |    164 |     }
  186 |    184 | }
  187 |        |
  188 |        | void ccv_nnc_graph_register_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
  189 |     53 | {
  190 |     53 |   ccv_nnc_graph_t* p = graph;
  191 |     53 |   const ccv_nnc_graph_tensor_wraps_ref_t tensor_wraps_ref = {
  192 |     53 |     .d = tensor_wraps_ref_d,
  193 |     53 |     .graph = graph,
  194 |     53 |   };
  195 |     99 |   do {
  196 |     99 |     if (!p->tensor_wraps_refs)
  197 |     44 |     {
  198 |     44 |       p->tensor_wraps_refs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wraps_ref_t), 0, 0);
  199 |     44 |       ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
  200 |     55 |     } else {
  201 |     55 |       int i;
  202 |     55 |       int has_tensor_wraps_ref = 0;
  203 |    152 |       for (i = 0; !has_tensor_wraps_ref && i < p->tensor_wraps_refs->rnum; i++)
  204 |     97 |       {
  205 |     97 |         ccv_nnc_graph_tensor_wraps_ref_t* tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
  206 |     97 |         has_tensor_wraps_ref = (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph);
  207 |     97 |       }
  208 |     55 |       if (!has_tensor_wraps_ref)
  209 |     55 |         ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
  210 |     55 |     }
  211 |     99 |     p = p->p;
  212 |     99 |   } while (p);
  213 |     53 | }
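
Registration walks the parent chain (p = p->p) so every enclosing graph of a sub-graph also learns about the wraps. A runnable stand-in sketch of that upward propagation (plain structs, not the real ccv types; the dedup check is omitted for brevity):

#include <stdio.h>

// Stand-in: push a record into each ancestor's list, mirroring the
// do/while over p = p->p above.
typedef struct graph { struct graph* p; int nrefs; } graph;

static void register_up(graph* g)
{
  graph* p = g;
  do {
    ++p->nrefs; // stands in for ccv_array_push(p->tensor_wraps_refs, ...)
    p = p->p;
  } while (p);
}

int main(void)
{
  graph root = { 0, 0 };
  graph sub = { &root, 0 };
  register_up(&sub);
  printf("%d %d\n", sub.nrefs, root.nrefs); // 1 1: both levels were updated
  return 0;
}
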
  214 |        |
  215 |        | static void _ccv_nnc_graph_redo_tensor_wraps(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
  216 |  32.5k | {
  217 |  32.5k |   int i;
  218 |  32.5k |   const int has_wrap = ccv_nnc_tensors_have_wraps(info->inputs, info->input_size) ||
  219 |  32.5k |     ccv_nnc_tensors_have_wraps(info->outputs, info->output_size) ||
  220 |  32.5k |     ccv_nnc_tensors_have_wraps(info->updates, info->update_size);
  221 |  32.5k |   if (has_wrap)
  222 |     61 |   {
  223 |     61 |     const int tensor_wrap_size = info->input_size + info->output_size + info->update_size;
  224 |     61 |     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = ccv_nnc_get_tensor_wrap_array(graph, tensor_wrap_size, &info->tensor_wraps_ref);
  225 |     61 |     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps, info->inputs, info->input_size);
  226 |     61 |     const int d = info->input_size;
  227 |     61 |     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + d, info->outputs, info->output_size);
  228 |     61 |     const int dd = info->input_size + info->output_size;
  229 |     61 |     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + dd, info->updates, info->update_size);
  230 |  32.4k |   } else if (info->tensor_wraps_ref) {
  231 |      1 |     ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
  232 |      1 |     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
  233 |      1 |     if (tensor_wrap_array)
  234 |      1 |     {
  235 |      4 |       for (i = 0; i < tensor_wrap_array->size; i++)
  236 |      3 |         if (tensor_wrap_array->tensor_wraps[i])
  237 |      2 |           _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[i]);
  238 |      1 |       ccfree(tensor_wrap_array);
  239 |      1 |       *tensor_wrap_array_ref = 0;
  240 |      1 |       info->tensor_wraps_ref = 0;
  241 |      1 |     }
  242 |      1 |   }
  243 |  32.5k | }
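
The wrap-array indexing used here (and in _ccv_nnc_graph_exec_rewind above) packs inputs, outputs, and updates into one flat array with offsets d and dd. A tiny runnable stand-in of that layout (plain structs, not the real ccv types):

#include <assert.h>

// Stand-in for the flat wrap-array layout: slots [0, input_size) are
// inputs, [d, d + output_size) are outputs with d = input_size, and
// [dd, dd + update_size) are updates with dd = input_size + output_size.
typedef struct { int input_size, output_size, update_size; } exec_shape;

static int output_slot(const exec_shape e, const int i) { return e.input_size + i; }
static int update_slot(const exec_shape e, const int i) { return e.input_size + e.output_size + i; }

int main(void)
{
  const exec_shape e = { 2, 1, 1 }; // 2 inputs, 1 output, 1 update
  assert(output_slot(e, 0) == 2);   // first output lands at slot d = 2
  assert(update_slot(e, 0) == 3);   // first update lands at slot dd = 3
  return 0;
}
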
  244 |        |
  245 |        | static void _ccv_nnc_graph_deregister_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
  246 |      1 | {
  247 |      1 |   ccv_nnc_graph_t* p = graph;
  248 |      2 |   do {
  249 |      2 |     int i;
  250 |        |     // Remove from the array.
  251 |      2 |     if (p->tensor_wraps_refs)
  252 |      2 |       for (i = 0; i < p->tensor_wraps_refs->rnum; i++)
  253 |      2 |       {
  254 |      2 |         ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
  255 |      2 |         if (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph)
  256 |      2 |         {
  257 |      2 |           --p->tensor_wraps_refs->rnum;
  258 |      2 |           if (i < p->tensor_wraps_refs->rnum)
  259 |      0 |             memcpy(tensor_wraps_ref, tensor_wraps_ref + 1, sizeof(ccv_nnc_graph_exec_t) * (p->tensor_wraps_refs->rnum - i));
  260 |      2 |           break;
  261 |      2 |         }
  262 |      2 |       }
  263 |      2 |     p = p->p;
  264 |      2 |   } while (p);
  265 |      1 | }
  266 |        |
  267 |        | void ccv_nnc_graph_exec_set_io_flags(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const int* const input_flags, const int input_flag_size, const int* const output_flags, const int output_flag_size)
  268 |  32.0k | {
  269 |  32.0k |   assert(exec.d < graph->exec_info->rnum);
  270 |  32.0k |   assert(exec.graph == graph);
  271 |  32.0k |   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  272 |  32.0k |   assert(input_flag_size <= info->input_size);
  273 |  32.0k |   assert(output_flag_size <= info->output_size);
  274 |  32.0k |   if (info->input_size + info->output_size == 0)
  275 |     19 |     return;
  276 |  32.0k |   if (!info->input_flags)
  277 |  32.0k |   {
  278 |  32.0k |     info->input_flags = (int*)cccalloc(info->input_size + info->output_size, sizeof(int));
  279 |  32.0k |     info->output_flags = info->input_flags + info->input_size;
  280 |  32.0k |   }
  281 |  32.0k |   if (input_flag_size > 0)
  282 |      0 |     memcpy(info->input_flags, input_flags, sizeof(int) * input_flag_size);
  283 |  32.0k |   if (output_flag_size > 0)
  284 |      0 |     memcpy(info->output_flags, output_flags, sizeof(int) * output_flag_size);
  285 |  32.0k | }
  286 |        |
  287 |        | void ccv_nnc_graph_exec_pair_with(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_graph_exec_t pair_exec)
  288 |    587 | {
  289 |    587 |   assert(exec.graph == graph);
  290 |    587 |   assert(exec.d >= 0);
  291 |    587 |   assert(exec.d < graph->exec_info->rnum);
  292 |    587 |   assert(pair_exec.graph == graph || pair_exec.graph == graph->pair);
  293 |    587 |   assert(pair_exec.d >= 0);
  294 |    587 |   if (pair_exec.graph == graph)
  295 |    583 |     { assert(pair_exec.d < graph->exec_info->rnum); }
  296 |      4 |   else
  297 |      4 |     { assert(pair_exec.d < graph->pair->exec_info->rnum); }
  298 |    587 |   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  299 |    587 |   exec_info->pair_ref = pair_exec.d + 1;
  300 |    587 | }
  301 |        |
  302 |        | static ccv_nnc_tensor_t* _ccv_nnc_any_tensor_from_tensor_multiview(ccv_nnc_tensor_multiview_t* const mv)
  303 |     92 | {
  304 |     92 |   ccv_nnc_tensor_t* tensor = (ccv_nnc_tensor_t*)mv;
  305 |    188 |   while (CCV_IS_TENSOR_MULTIVIEW(tensor))
  306 |     96 |   {
  307 |     96 |     ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
  308 |     96 |     const int count = 0;
  309 |     96 |     const int off = mv->kind;
  310 |     96 |     const int mod = mv->repeat;
  311 |        |     // If reached the root.
  312 |     96 |     tensor = CCV_NNC_MULTIVIEW_DATA(mv)[count >= off ? ((count - off) % mod) + off : count]; // Unwrap.
  313 |     96 |   }
  314 |     92 |   return tensor;
  315 |     92 | }
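
Since count is pinned to 0 here, each nesting level simply resolves to view 0; the general index expression matters at runtime, where an iteration counter takes count's place. A small runnable sketch of that expression (plain C, not the real ccv types), with off = kind and mod = repeat:

#include <stdio.h>

// Stand-in for the unwrap index above: iteration t picks view t while
// t < off (the one-off leading views), then cycles through the
// remaining mod views forever.
static int view_of(const int t, const int off, const int mod)
{
  return t >= off ? ((t - off) % mod) + off : t;
}

int main(void)
{
  int t;
  for (t = 0; t < 6; t++) // off (kind) = 1, mod (repeat) = 2
    printf("%d ", view_of(t, 1, 2)); // prints: 0 1 2 1 2 1
  printf("\n");
  return 0;
}
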
  316 |        |
  317 |        | void ccv_nnc_graph_exec_set_io(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
  318 |     23 | {
  319 |     23 |   assert(exec.d < graph->exec_info->rnum);
  320 |     23 |   assert(exec.graph == graph);
  321 |     23 |   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  322 |        |   // De-register from the graph if it contains multiview tensors.
  323 |     23 |   if (info->tensor_wraps_ref)
  324 |      1 |     _ccv_nnc_graph_deregister_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  325 |        |   // In case it is already executed, rewind.
  326 |     23 |   _ccv_nnc_graph_exec_rewind(info, graph);
  327 |     23 |   if (input_size == 0 && output_size == 0)
  328 |      1 |   {
  329 |      1 |     if (info->input_size > 0 || info->output_size > 0)
  330 |      0 |       ccfree(info->inputs);
  331 |      1 |     info->inputs = 0;
  332 |      1 |     info->outputs = 0;
  333 |      1 |     info->input_size = 0;
  334 |      1 |     info->output_size = 0;
  335 |      1 |     _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  336 |      1 |     if (info->tensor_wraps_ref)
  337 |      0 |       ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  338 |      1 |     return;
  339 |      1 |   }
  340 |     22 |   if (info->inputs)
  341 |      2 |     info->inputs = (ccv_nnc_tensor_t**)ccrealloc(info->inputs, sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  342 |     20 |   else
  343 |     20 |     info->inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  344 |     22 |   info->outputs = info->inputs + input_size;
  345 |     22 |   if (inputs)
  346 |     22 |     memcpy(info->inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
  347 |     22 |   if (outputs)
  348 |     22 |     memcpy(info->outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
  349 |     22 |   int i;
  350 |     22 |   int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
  351 |     77 |   for (i = 0; i < input_size + output_size; i++)
  352 |     55 |     if (info->inputs[i])
  353 |     55 |     {
  354 |     55 |       ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info->inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info->inputs[i]) : info->inputs[i];
  355 |     55 |       tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(tensor->info.datatype);
  356 |     55 |     }
  357 |     22 |   info->cmd.backend = ccv_nnc_cmd_find_backend(info->cmd, tensor_memory, tensor_formats, tensor_datatypes);
  358 |     22 |   info->input_size = input_size;
  359 |     22 |   info->output_size = output_size;
  360 |     22 |   _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  361 |        |   // Register again if the tensor wraps exist.
  362 |     22 |   if (info->tensor_wraps_ref)
  363 |      2 |     ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  364 |        |   // Free flags.
  365 |     22 |   if (info->input_flags)
  366 |      0 |   {
  367 |      0 |     ccfree(info->input_flags);
  368 |      0 |     info->input_flags = info->output_flags = 0;
  369 |      0 |   }
  370 |     22 | }
  371 |        |
  372 |        | void ccv_nnc_graph_exec_add_as_affected(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const update)
  373 |     23 | {
  374 |     23 |   assert(CCV_IS_TENSOR_MULTIVIEW(update));
  375 |     23 |   assert(exec.d < graph->exec_info->rnum);
  376 |     23 |   assert(exec.graph == graph);
  377 |     23 |   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  378 |     23 |   const int register_tensor_wraps = !info->tensor_wraps_ref;
  379 |     23 |   const int update_index = info->update_size;
  380 |     23 |   ++info->update_size;
  381 |     23 |   if (info->updates)
  382 |      6 |     info->updates = (ccv_nnc_tensor_t**)ccrealloc(info->updates, sizeof(ccv_nnc_tensor_t*) * info->update_size);
  383 |     17 |   else
  384 |     17 |     info->updates = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * info->update_size);
  385 |     23 |   info->updates[update_index] = update;
  386 |     23 |   _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  387 |     23 |   if (register_tensor_wraps)
  388 |     14 |     ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  389 |     23 | }
  390 |        |
  391 |        | ccv_nnc_graph_exec_t ccv_nnc_graph_exec_new(ccv_nnc_graph_t* const graph, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
  392 |  32.4k | {
  393 |  32.4k |   int d = graph->exec_info->rnum;
  394 |  32.4k |   ccv_nnc_graph_exec_info_t info = {
  395 |  32.4k |     .cmd = cmd,
  396 |  32.4k |     .hint = hint,
  397 |  32.4k |     .input_size = input_size,
  398 |  32.4k |     .output_size = output_size,
  399 |  32.4k |   };
  400 |  32.4k |   assert(inputs || input_size == 0);
  401 |  32.4k |   assert(outputs || output_size == 0);
  402 |  32.4k |   if (input_size > 0 || output_size > 0)
  403 |  32.1k |   {
  404 |  32.1k |     info.inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  405 |  32.1k |     info.outputs = info.inputs + input_size;
  406 |  32.1k |     if (inputs)
  407 |  32.1k |       memcpy(info.inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
  408 |  32.1k |     if (outputs)
  409 |  32.1k |       memcpy(info.outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
  410 |  32.1k |     info.input_size = input_size;
  411 |  32.1k |     info.output_size = output_size;
  412 |  32.1k |     int i;
  413 |  32.1k |     int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
  414 |   173k |     for (i = 0; i < input_size + output_size; i++)
  415 |   141k |       if (info.inputs[i])
  416 |   105k |       {
  417 |   105k |         ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info.inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info.inputs[i]) : info.inputs[i];
  418 |   105k |         tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(tensor->info.datatype);
  419 |   105k |       }
  420 |  32.1k |     info.cmd.backend = ccv_nnc_cmd_find_backend(info.cmd, tensor_memory, tensor_formats, tensor_datatypes);
  421 |  32.1k |   }
  422 |  32.4k |   _ccv_nnc_graph_redo_tensor_wraps(&info, graph);
  423 |        |   // Add itself to the graph's wraps array; this will help the runtime when we run the graph and do unwrapping.
  424 |  32.4k |   if (info.tensor_wraps_ref)
  425 |     36 |     ccv_nnc_graph_register_tensor_wraps(graph, info.tensor_wraps_ref - 1);
  426 |  32.4k |   ccv_array_push(graph->exec_info, &info);
  427 |  32.4k |   return (ccv_nnc_graph_exec_t){
  428 |  32.4k |     .d = d,
  429 |  32.4k |     .graph = graph,
  430 |  32.4k |   };
  431 |  32.4k | }
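
For orientation, a minimal end-to-end usage sketch of this Level-2 API. It assumes the public helpers from ccv_nnc.h and ccv_nnc_easy.h (ccv_nnc_init, CPU_TENSOR_NHWC, CMD_EWSUM_FORWARD, TENSOR_LIST, GRAPH_EXEC_LIST, ccv_nnc_no_hint, TRAVERSE_FULL, ccv_nnc_graph_run) behave as they do in the repository's tests; it is not a verbatim test from the codebase:

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

int main(void)
{
  ccv_nnc_init();
  ccv_nnc_graph_t* graph = ccv_nnc_graph_new();
  ccv_nnc_tensor_t* a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  ccv_nnc_tensor_t* c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 2), 0);
  a->data.f32[0] = 1, a->data.f32[1] = 2;
  b->data.f32[0] = 3, b->data.f32[1] = 4;
  // One exec node computing c = a + b; the backend is picked inside
  // ccv_nnc_graph_exec_new from the tensors' memory / format / datatype.
  ccv_nnc_graph_exec_t sum = ccv_nnc_graph_exec_new(graph, CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, TENSOR_LIST(a, b), TENSOR_LIST(c));
  ccv_nnc_graph_set_sources(graph, GRAPH_EXEC_LIST(sum));
  ccv_nnc_graph_set_destinations(graph, GRAPH_EXEC_LIST(sum));
  ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  ccv_nnc_graph_free(graph);
  ccv_nnc_tensor_free(a);
  ccv_nnc_tensor_free(b);
  ccv_nnc_tensor_free(c);
  return 0;
}
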
  432 |        |
  433 |        | void ccv_nnc_graph_add_carry_over(ccv_nnc_graph_t* const graph, const ccv_nnc_tensor_t* const from, const ccv_nnc_tensor_t* const to)
  434 |     25 | {
  435 |     25 |   ccv_nnc_graph_tensor_carry_over_t carry_over = {
  436 |     25 |     .from = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)from),
  437 |     25 |     .to = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)to)
  438 |     25 |   };
  439 |     25 |   if (!graph->carry_overs)
  440 |     21 |     graph->carry_overs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_carry_over_t), 0, 0);
  441 |     25 |   ccv_array_push(graph->carry_overs, &carry_over);
  442 |     25 | }
  443 |        |
  444 |        | int ccv_nnc_graph_exec_concat(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
  445 |  29.2k | {
  446 |  29.2k |   assert(graph == source.graph);
  447 |  29.2k |   assert(graph == destination.graph);
  448 |  29.2k |   assert(source.d < graph->exec_info->rnum);
  449 |  29.2k |   assert(destination.d < graph->exec_info->rnum);
  450 |  29.2k |   ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
  451 |  29.2k |   if (src_info->outgoings == 0)
  452 |  26.2k |     src_info->outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
  453 |  3.07k |   else {
  454 |  3.07k |     int i;
  455 |        |     // Check if this is already connected; if so, skip.
  456 |  9.31k |     for (i = 0; i < src_info->outgoings->rnum; i++)
  457 |  6.24k |       if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
  458 |      0 |         return -1;
  459 |  3.07k |   }
  460 |  29.2k |   ccv_array_push(src_info->outgoings, &destination.d);
  461 |  29.2k |   graph->topsorted = 0;
  462 |  29.2k |   return 0;
  463 |  29.2k | }
  464 |        |
  465 |        | int ccv_nnc_graph_exec_disjoin(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
  466 |      0 | {
  467 |      0 |   assert(graph == source.graph);
  468 |      0 |   assert(graph == destination.graph);
  469 |      0 |   assert(source.d < graph->exec_info->rnum);
  470 |      0 |   assert(destination.d < graph->exec_info->rnum);
  471 |      0 |   ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
  472 |      0 |   if (!src_info->outgoings)
  473 |      0 |     return -1;
  474 |      0 |   int i;
  475 |        |   // Find the connection and remove it if it exists.
  476 |      0 |   for (i = 0; i < src_info->outgoings->rnum; i++)
  477 |      0 |     if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
  478 |      0 |     {
  479 |      0 |       if (i < src_info->outgoings->rnum - 1)
  480 |      0 |         *(int*)ccv_array_get(src_info->outgoings, i) = *(int*)ccv_array_get(src_info->outgoings, src_info->outgoings->rnum - 1);
  481 |      0 |       --src_info->outgoings->rnum;
  482 |      0 |       graph->topsorted = 0;
  483 |      0 |       return 0;
  484 |      0 |     }
  485 |      0 |   return -1;
  486 |      0 | }
  487 |        |
  488 |        | int ccv_nnc_graph_exec_count(const ccv_nnc_graph_t* const graph)
  489 |      0 | {
  490 |      0 |   return graph->exec_info ? graph->exec_info->rnum : 0;
  491 |      0 | }
  492 |        |
  493 |        | void* ccv_nnc_graph_buffer(ccv_nnc_graph_t* const graph, int size)
  494 |  26.5k | {
  495 |  26.5k |   if (graph->buffer_size >= size)
  496 |  26.1k |     return graph->buffer;
  497 |    352 |   graph->buffer_size = size;
  498 |    352 |   graph->buffer = (graph->buffer) ? ccrealloc(graph->buffer, size) : ccmalloc(size);
  499 |    352 |   return graph->buffer;
  500 |  26.5k | }
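
This is a grow-only scratch allocation: requests no larger than the current capacity reuse it, larger ones reallocate. A tiny stand-alone sketch of the same semantics (plain malloc/realloc in place of ccmalloc/ccrealloc):

#include <stdio.h>
#include <stdlib.h>

static void* buffer;
static int buffer_size;

// Mirrors ccv_nnc_graph_buffer: never shrinks, grows on demand.
static void* graph_buffer(int size)
{
  if (buffer_size >= size)
    return buffer;
  buffer_size = size;
  buffer = buffer ? realloc(buffer, size) : malloc(size);
  return buffer;
}

int main(void)
{
  graph_buffer(64);
  graph_buffer(32);  // reuses the 64-byte allocation
  graph_buffer(128); // grows
  printf("%d\n", buffer_size); // 128
  free(buffer);
  return 0;
}
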
  501 |        |
  502 |        | void ccv_nnc_graph_topsort(ccv_nnc_graph_t* const graph, int* const exec_cvt, const int exec_cvt_size)
  503 |  6.22k | {
  504 |  6.22k |   if (exec_cvt_size == 0 && graph->exec_info->rnum == 0)
  505 |      0 |   {
  506 |      0 |     graph->topsorted = 1;
  507 |      0 |     return;
  508 |      0 |   }
  509 |  6.22k |   assert(exec_cvt_size == graph->exec_info->rnum);
  510 |  6.22k |   assert(graph->sources && graph->sources->rnum);
  511 |  6.22k |   assert(graph->destinations && graph->destinations->rnum);
  512 |  6.22k |   int i, j;
  513 |  38.6k |   for (i = 0; i < exec_cvt_size; i++)
  514 |  32.4k |     exec_cvt[i] = -1;
  515 |  6.22k |   ccv_array_t* exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), graph->exec_info->rnum, 0);
  516 |        |   // If there are breakpoints, it is more complicated: we first sort up to the breakpoints, and then continue from the breakpoints to the destinations.
  517 |  6.22k |   if (graph->breakpoint_size)
  518 |     21 |   {
  519 |     42 |     ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, graph->breakpoints, graph->breakpoint_size, 0);
  520 |     42 |     for (i = 0; i < graph->breakpoint_size; i++)
  521 |     21 |       exec_cvt[graph->breakpoints[i].d] = -2; // Mark these as breakpoints, so we will skip them in the first round.
  522 |     42 |     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  523 |     32 |       assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  524 |     32 |       if (exec_cvt[idx] == -2) // Skip breakpoint.
  525 |     21 |         continue;
  526 |        |       // Loop over node and push to the array.
  527 |     11 |       ccv_array_push(exec_info, node);
  528 |        |       // Go to its sub-graph to fix exec_idx.
  529 |     11 |       for (i = 0; i < node->graph_ref_size; i++)
  530 |      0 |       {
  531 |      0 |         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  532 |      0 |         if (graph_ref >= 0)
  533 |      0 |         {
  534 |      0 |           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  535 |      0 |           sub_graph->exec_idx = exec_info->rnum;
  536 |      0 |         }
  537 |      0 |       }
  538 |     11 |       exec_cvt[idx] = exec_info->rnum - 1;
  539 |     11 |     } ccv_nnc_graph_visit_endfor
  540 |     21 |     ccv_nnc_graph_visit_free(visit);
  541 |     21 |     graph->breakpoint_offset = exec_info->rnum;
  542 |     42 |     visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
  543 |     44 |     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  544 |     44 |       assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  545 |        |       // Loop over node and push to the array.
  546 |     44 |       ccv_array_push(exec_info, node);
  547 |        |       // Go to its sub-graph to fix exec_idx.
  548 |     52 |       for (i = 0; i < node->graph_ref_size; i++)
  549 |      8 |       {
  550 |      8 |         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  551 |      8 |         if (graph_ref >= 0)
  552 |      8 |         {
  553 |      8 |           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  554 |      8 |           sub_graph->exec_idx = exec_info->rnum;
  555 |      8 |         }
  556 |      8 |       }
  557 |     44 |       exec_cvt[idx] = exec_info->rnum - 1;
  558 |     44 |     } ccv_nnc_graph_visit_endfor
  559 |     21 |     ccv_nnc_graph_visit_free(visit);
  560 |     42 |     for (i = 0; i < graph->breakpoint_size; i++)
  561 |     21 |       { assert(exec_cvt[graph->breakpoints[i].d] >= 0); } // All breakpoints should be assigned.
  562 |  6.20k |   } else {
  563 |  12.4k |     ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
  564 |  32.3k |     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  565 |  32.3k |       assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  566 |        |       // Loop over node and push to the array.
  567 |  32.3k |       ccv_array_push(exec_info, node);
  568 |        |       // Go to its sub-graph to fix exec_idx.
  569 |  32.3k |       for (i = 0; i < node->graph_ref_size; i++)
  570 |     42 |       {
  571 |     42 |         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  572 |     42 |         if (graph_ref >= 0)
  573 |     42 |         {
  574 |     42 |           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  575 |     42 |           sub_graph->exec_idx = exec_info->rnum;
  576 |     42 |         }
  577 |     42 |       }
  578 |  32.3k |       exec_cvt[idx] = exec_info->rnum - 1;
  579 |  32.3k |     } ccv_nnc_graph_visit_endfor
  580 |  6.20k |     ccv_nnc_graph_visit_free(visit);
  581 |  6.20k |   }
  582 |  6.22k |   assert(graph->exec_info->rnum == exec_info->rnum);
  583 |  6.22k |   ccv_array_free(graph->exec_info);
  584 |  6.22k |   graph->exec_info = exec_info;
  585 |  12.4k |   for (i = 0; i < graph->sources->rnum; i++)
  586 |  6.22k |   {
  587 |  6.22k |     ccv_nnc_graph_exec_t* const source = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, i);
  588 |  6.22k |     source->d = exec_cvt[source->d];
  589 |  6.22k |   }
  590 |  12.4k |   for (i = 0; i < graph->destinations->rnum; i++)
  591 |  6.22k |   {
  592 |  6.22k |     ccv_nnc_graph_exec_t* const destination = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, i);
  593 |  6.22k |     destination->d = exec_cvt[destination->d];
  594 |  6.22k |   }
  595 |        |   // Update all outgoings to reflect the latest.
  596 |  38.6k |   for (i = 0; i < exec_info->rnum; i++)
  597 |  32.4k |   {
  598 |  32.4k |     ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(exec_info, i);
  599 |  32.4k |     if (info->outgoings)
  600 |  55.4k |       for (j = 0; j < info->outgoings->rnum; j++)
  601 |  29.2k |         *(int*)ccv_array_get(info->outgoings, j) = exec_cvt[*(int*)ccv_array_get(info->outgoings, j)];
  602 |  32.4k |   }
  603 |  6.22k |   graph->topsorted = 1;
  604 |  6.22k | }
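
One practical consequence: topsort reorders exec_info, so any ccv_nnc_graph_exec_t handle taken before the call must be remapped through exec_cvt. A sketch of a hypothetical caller-side helper (topsort_and_fix is not a function from the repository; ccmalloc/ccfree are ccv's allocator wrappers):

#include "ccv_nnc.h"

// Hypothetical helper: run the topsort, then remap exec handles the
// caller kept from before the sort. exec_cvt[old_d] is the node's new
// position (-1 would indicate a node the visit never reached).
static void topsort_and_fix(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_t* const held, const int held_size)
{
  const int exec_size = ccv_nnc_graph_exec_count(graph);
  int* const exec_cvt = (int*)ccmalloc(sizeof(int) * exec_size);
  ccv_nnc_graph_topsort(graph, exec_cvt, exec_size);
  int i;
  for (i = 0; i < held_size; i++)
    held[i].d = exec_cvt[held[i].d];
  ccfree(exec_cvt);
}
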
  605 |        |
  606 |        | typedef struct {
  607 |        |   int device_id;
  608 |        |   int exec_idx;
  609 |        |   ccv_array_t* signal_set;
  610 |        |   ccv_array_t* command_set; // The set of commands executed on this stream. In case there is a tie (on rank), we will check this.
  611 |        | } ccv_nnc_stream_data_t;
  612 |        |
  613 |        | static void _ccv_nnc_graph_schedule_assign_signals(ccv_array_t* const incoming, ccv_nnc_graph_exec_schedule_t* const node, ccv_array_t* const stream_data, int* const signal_size, ccv_nnc_graph_exec_schedule_t* const exec_info, const int exec_info_size)
  614 |  4.80k | {
  615 |  4.80k |   assert(incoming->rnum > 0);
  616 |  4.80k |   int i, j, k;
  617 |  4.80k |   int wait_size = 0, max_wait_size = 0;
  618 |  10.7k |   for (i = 0; i < incoming->rnum; i++)
  619 |  5.98k |   {
  620 |  5.98k |     const int incoming_idx = *(int*)ccv_array_get(incoming, i);
  621 |  5.98k |     ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
  622 |  5.98k |     assert(incoming_exec_info->stream_size > 0);
  623 |  5.98k |     max_wait_size += incoming_exec_info->stream_size;
  624 |  5.98k |   }
  625 |  4.80k |   int waits[ccv_max(1, max_wait_size)];
  626 |  4.80k |   assert(node->stream_size > 0);
  627 |  10.7k |   for (i = 0; i < incoming->rnum; i++)
  628 |  5.98k |   {
  629 |  5.98k |     const int incoming_idx = *(int*)ccv_array_get(incoming, i);
  630 |  5.98k |     assert(incoming_idx < exec_info_size);
  631 |  5.98k |     assert(incoming_idx >= 0);
  632 |  5.98k |     ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
  633 |  5.98k |     assert(incoming_exec_info->stream_size > 0);
  634 |  5.98k |     int stream_synced = 1;
  635 |        |     // If the current node's streams are a subset of the incoming node's streams, there
  636 |        |     // is no need to sync with a signal, because we are already synced with the incoming.
  637 |  11.9k |     for (j = 0; stream_synced && j < node->stream_size; j++)
  638 |  5.99k |     {
  639 |  5.99k |       const int s = SCHEDULE_STREAMS(*node)[j];
  640 |  5.99k |       assert(s >= 0);
  641 |  5.99k |       int flag = 0;
  642 |  12.8k |       for (k = 0; !flag && k < incoming_exec_info->stream_size; k++)
  643 |  6.82k |         flag = (SCHEDULE_STREAMS(*incoming_exec_info)[k] == s);
  644 |  5.99k |       stream_synced = flag;
  645 |  5.99k |     }
  646 |  5.98k |     if (stream_synced)
  647 |  4.24k |       continue;
  648 |        |     // Otherwise, find the streams we need to sync with, and create signals for these.
  649 |  3.49k |     for (j = 0; j < incoming_exec_info->stream_size; j++)
  650 |  1.75k |     {
  651 |  1.75k |       const int s = SCHEDULE_STREAMS(*incoming_exec_info)[j];
  652 |  1.75k |       assert(s >= 0);
  653 |  1.75k |       int flag = 0;
  654 |  4.50k |       for (k = 0; !flag && k < node->stream_size; k++)
  655 |  2.75k |         flag = (SCHEDULE_STREAMS(*node)[k] == s);
  656 |  1.75k |       if (!flag) // Need to have a signal.
  657 |  1.72k |       {
  658 |  1.72k |         if (SCHEDULE_SIGNALS(*incoming_exec_info)[j] < 0)
  659 |  1.33k |           SCHEDULE_SIGNALS(*incoming_exec_info)[j] = (*signal_size)++;
  660 |    393 |         else {
  661 |    393 |           int flag = 0;
  662 |        |           // If any of the current node's streams has already seen this signal, we are good already.
  663 |  1.33k |           for (k = 0; !flag && k < node->stream_size; k++)
  664 |    943 |           {
  665 |    943 |             assert(SCHEDULE_STREAMS(*node)[k] >= 0);
  666 |    943 |             ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
  667 |    943 |             flag = (data->signal_set && ccv_array_find_int(data->signal_set, SCHEDULE_SIGNALS(*incoming_exec_info)[j]));
  668 |    943 |           }
  669 |    393 |           if (flag)
  670 |      0 |             continue;
  671 |    393 |         }
  672 |        |         // Otherwise, we need to wait for this. Currently, our granularity is to wait on all streams.
  673 |  1.72k |         waits[wait_size++] = SCHEDULE_SIGNALS(*incoming_exec_info)[j];
  674 |        |         // All streams on this node have seen this signal.
  675 |  4.43k |         for (k = 0; k < node->stream_size; k++)
  676 |  2.70k |         {
  677 |  2.70k |           ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
  678 |  2.70k |           if (!data->signal_set)
  679 |    919 |             data->signal_set = ccv_array_new(sizeof(int), 0, 0);
  680 |  2.70k |           ccv_array_push(data->signal_set, &SCHEDULE_SIGNALS(*incoming_exec_info)[j]);
  681 |  2.70k |         }
  682 |  1.72k |       }
  683 |  1.75k |     }
  684 |  1.73k |   }
  685 |  4.80k |   node->wait_size = wait_size;
  686 |  4.80k |   if (wait_size > 0)
  687 |    833 |   {
  688 |    833 |     node->waits = node->waits ? ccrealloc(node->waits, sizeof(int) * wait_size) : ccmalloc(sizeof(int) * wait_size);
  689 |    833 |     memcpy(node->waits, waits, sizeof(int) * wait_size);
  690 |    833 |   }
  691 |  4.80k | }
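
The core rule above, in isolation: an incoming stream needs a signal/wait pair only when the current node does not itself run on that stream (same-stream work is already ordered). A runnable stand-in with plain ints (not the real schedule structs):

#include <stdio.h>

// Stand-in sketch of the synchronization rule in
// _ccv_nnc_graph_schedule_assign_signals.
static int needs_signal(const int incoming_stream, const int* const node_streams, const int node_stream_size)
{
  int k;
  for (k = 0; k < node_stream_size; k++)
    if (node_streams[k] == incoming_stream)
      return 0; // same stream: already ordered, no signal needed
  return 1;
}

int main(void)
{
  const int node_streams[2] = { 0, 2 };
  printf("%d\n", needs_signal(1, node_streams, 2)); // 1: must wait on a signal from stream 1
  printf("%d\n", needs_signal(2, node_streams, 2)); // 0: implicit in-stream order
  return 0;
}
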
  692 |        |
  693 |        | typedef struct {
  694 |        |   int rank;
  695 |        |   ccv_array_t* outgoings;
  696 |        | } ccv_nnc_incoming_t;
  697 |        |
  698 |        | static int _ccv_nnc_device_ids_for_stream_data(ccv_nnc_graph_exec_info_t* const node, const int device_id, ccv_array_t* const stream_data, int* const device_ids, const int max_device_id_size)
  699 |  12.8k | {
  700 |        |   // TODO: I need to re-think whether this is GPU only or not.
  701 |  12.8k |   int device_id_size = ccv_nnc_device_ids_for_io(node->inputs, node->input_size, node->outputs, node->output_size, CCV_TENSOR_GPU_MEMORY, device_ids, max_device_id_size);
  702 |  12.8k |   if (device_id_size == 0)
  703 |  2.47k |   {
  704 |        |     // If there is a default data, use that device id. Otherwise, use the device id passed in (this will be the default data device id).
  705 |  2.47k |     if (stream_data->rnum > 0)
  706 |  2.29k |     {
  707 |  2.29k |       ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
  708 |  2.29k |       device_ids[0] = default_data->device_id;
  709 |  2.29k |     } else
  710 |    180 |       device_ids[0] = device_id >= 0 ? device_id : 0;
  711 |  2.47k |     device_id_size = 1;
  712 |  2.47k |   }
  713 |  12.8k |   return device_id_size;
  714 |  12.8k | }
  715 |        |
  716 |        | void ccv_nnc_graph_static_schedule_free(ccv_nnc_graph_static_schedule_t* const schedule)
  717 |    400 | {
  718 |    400 |   int i;
  719 |    400 |   ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
  720 |  7.64k |   for (i = 0; i < schedule->exec_info_size; i++)
  721 |  7.24k |   {
  722 |  7.24k |     if (schd_info[i].stream_size > 1)
  723 |    150 |       ccfree(schd_info[i]._heap_streams);
  724 |  7.24k |     if (schd_info[i].waits)
  725 |    833 |       ccfree(schd_info[i].waits);
  726 |  7.24k |   }
  727 |    400 |   if (schedule->stream_1s)
  728 |     14 |     ccfree(schedule->stream_1s);
  729 |    400 |   if (schedule->waits)
  730 |     10 |     ccfree(schedule->waits);
  731 |    400 |   if (schedule->psort)
  732 |     63 |     ccfree(schedule->psort);
  733 |    400 |   if (schedule->begin)
  734 |     14 |     ccv_nnc_stream_signal_free(schedule->begin);
  735 |    400 |   if (schedule->end)
  736 |    400 |     ccv_nnc_stream_signal_free(schedule->end);
  737 |    400 |   ccfree(schedule);
  738 |    400 | }
739
740
static ccv_nnc_graph_static_schedule_t* _ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int device_id, const int max_stream_count, ccv_nnc_stream_context_t* const stream_context, const ccv_nnc_graph_exec_t* const _sources, const int _source_size, const ccv_nnc_graph_exec_t* const _destinations, const int _destination_size)
741
400
{
742
400
  assert(graph->sources && graph->sources->rnum);
743
400
  assert(graph->destinations && graph->destinations->rnum);
744
400
  assert(graph->topsorted); // Only support this on a topsorted graph.
745
400
  const int exec_info_size = graph->exec_info->rnum;
746
400
  assert(exec_info_size > 0);
747
400
  const ccv_nnc_graph_exec_t* const sources = _sources == 0 ? 
(ccv_nnc_graph_exec_t*)371
ccv_array_get371
(graph->sources, 0) :
_sources29
;
748
400
  const int source_size = _sources == 0 ? 
graph->sources->rnum371
:
_source_size29
;
749
400
  if (!_sources)
750
371
    { assert(_source_size == 0); }
751
400
  const ccv_nnc_graph_exec_t* const destinations = _destinations == 0 ? 
(ccv_nnc_graph_exec_t*)362
ccv_array_get362
(graph->destinations, 0) :
_destinations38
;
752
400
  const int destination_size = _destinations == 0 ? 
graph->destinations->rnum362
:
_destination_size38
;
753
400
  if (!_destinations)
754
362
    { assert(_destination_size == 0); }
755
400
  const int root_schedule = (_sources == 0 && 
_destinations == 0371
);
756
400
  ccv_nnc_graph_static_schedule_t* const schedule = cccalloc(1, sizeof(ccv_nnc_graph_static_schedule_t) + sizeof(ccv_nnc_graph_exec_schedule_t) * (exec_info_size - 1));
757
400
  schedule->exec_info_size = exec_info_size;
758
400
  ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
759
400
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0);
760
800
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new400
(graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
761
400
  if (!root_schedule)
762
63
  {
763
    // If this is not a root schedule, we need to do partial topsort.
764
63
    int psort_size = 0;
765
1.54k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
766
1.54k
      ++psort_size;
767
1.54k
    } ccv_nnc_graph_visit_endfor
768
63
    schedule->psort = (int*)ccmalloc(sizeof(int) * psort_size);
769
63
    schedule->psort_size = psort_size;
770
63
    psort_size = 0;
771
1.54k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
772
1.54k
      schedule->psort[psort_size++] = idx;
773
1.54k
    } ccv_nnc_graph_visit_endfor
774
63
  }
775
800
  int i, j, k;
776
  // Generate exec dependencies (or, in other words, partial ordering of executions).
777
800
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(exec_info_size, exec_info_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
778
800
  int* buf = (int*)
ccmalloc400
(sizeof(int) * exec_info_size * 2);
779
800
  int buf_size;
780
800
#define for_block(x, val) \
781
175k
  do { \
782
175k
    if (((int32_t*)val)[0] > 0) \
783
175k
    { \
784
175k
      buf[buf_size * 2] = x; \
785
175k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
786
175k
      ++buf_size; \
787
175k
    } \
788
175k
  } while (0)
789
7.64k
  for (i = 0; i < exec_info_size; 
i++7.24k
)
790
7.24k
    schd_info[i].stream_size = -1;
791
5.22k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx, term) {
792
5.22k
    buf_size = 0; /* save all its parent deps to this buffer */
793
5.22k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
794
5.22k
    schd_info[idx].stream_size = 0;
795
5.22k
    if (vector)
796
175k
      
CCV_SPARSE_VECTOR_FOREACH4.80k
(exec_dep, vector, for_block);
797
5.22k
    if (!node->outgoings)
798
362
      continue;
799
14.0k
    
for (i = 0; 4.85k
i < node->outgoings->rnum;
i++9.21k
)
800
9.21k
    {
801
9.21k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
802
9.21k
      const int32_t one = 1;
803
9.21k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
804
      /* If not found, set, if the current node is the destination node, no need
805
       * set itself as parent of subsequent nodes because its terminal nature. */
806
9.21k
      if (!term && 
(9.13k
!cell.i329.13k
||
cell.i32[0] == 00
))
807
9.13k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
808
309k
      for (j = 0; j < buf_size; 
j++300k
) /* set with all idx's dependencies as well */
809
300k
      {
810
300k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
811
        /* If not found, set */
812
300k
        if (!cell.i32 || 
cell.i32[0] == 0117k
)
813
183k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
814
117k
        else {
815
          /* Otherwise, set to the longest one */
816
117k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
817
117k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
818
117k
        }
819
300k
      }
820
9.21k
    }
821
4.85k
  } ccv_nnc_graph_visit_endfor
822
400
#undef for_block
823
400
  ccfree(buf);
824
  // Algorithm to allocate signals and streams for this graph.
825
400
  ccv_array_t* const stream_data = ccv_array_new(sizeof(ccv_nnc_stream_data_t), 0, 0);
826
400
  ccv_array_t** const outgoings = cccalloc(exec_info_size, sizeof(ccv_array_t*));
827
400
  ccv_nnc_incoming_t* const incomings = cccalloc(exec_info_size, sizeof(ccv_nnc_incoming_t));
828
400
  int max_device_id_size = 1;
829
  // Filter out outgoing nodes that we will be able to access it afterwards anyway.
830
5.22k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
831
5.22k
    max_device_id_size = ccv_max(node->input_size + node->output_size, max_device_id_size);
832
5.22k
    if (node->outgoings)
833
4.85k
    {
834
4.85k
      outgoings[idx] = ccv_array_new(sizeof(int), 0, 0);
835
14.0k
      for (i = 0; i < node->outgoings->rnum; 
i++9.21k
)
836
9.21k
      {
837
9.21k
        const int di = *(int*)ccv_array_get(node->outgoings, i);
838
        // Skip if we haven't accessed this exec.
839
9.21k
        if (schd_info[di].stream_size < 0)
840
1.34k
          continue;
841
7.86k
        int flag = 0;
842
26.2k
        for (j = 0; !flag && 
j < node->outgoings->rnum24.3k
;
j++18.3k
)
843
18.3k
        {
844
18.3k
          if (j != i)
845
12.2k
          {
846
12.2k
            const int dj = *(int*)ccv_array_get(node->outgoings, j);
847
12.2k
            ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, di, dj);
848
12.2k
            flag = (cell.i32 && 
cell.i32[0]1.88k
);
849
12.2k
          }
850
18.3k
        }
851
7.86k
        if (!flag)
852
5.98k
        {
853
5.98k
          ccv_array_push(outgoings[idx], &di);
854
5.98k
          if (!incomings[di].outgoings)
855
4.80k
            incomings[di].outgoings = ccv_array_new(sizeof(int), 1, 0);
856
5.98k
          ccv_array_push(incomings[di].outgoings, &idx);
857
5.98k
        }
858
7.86k
      }
859
4.85k
    }
860
5.22k
  } ccv_nnc_graph_visit_endfor
861
400
#define visitor(node, idx, _) \
862
5.22k
  if (node->outgoings) \
863
10.7k
    
for (i = 0; 4.80k
i < node->outgoings->rnum;
i++5.98k
) \
864
5.98k
    { \
865
5.98k
      const int d = *(int*)ccv_array_get(node->outgoings, i); \
866
5.98k
      node->rank = ccv_max(incomings[d].rank + 1, node->rank); \
867
5.98k
    }
868
5.22k
  
CCV_NNC_GRAPH_VISIT400
(graph, incomings, exec_info_size, destinations, destination_size, sources, source_size, 0, visitor);
869
400
#undef visitor
870
400
  int device_ids[max_device_id_size];
871
400
  int outgoing_device_ids[max_device_id_size];
872
400
  int signal_size = 0;
873
5.22k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
874
    // Go through the incomings.
875
5.22k
    const int device_id_size = _ccv_nnc_device_ids_for_stream_data(node, device_id, stream_data, device_ids, max_device_id_size);
876
5.22k
    if (schd_info[idx].stream_size == 0)
877
420
    {
878
420
      schd_info[idx].stream_size = device_id_size; // At least at the same size as the device_id_size.
879
420
      if (device_id_size > 1)
880
6
      {
881
6
        schd_info[idx]._heap_streams = (int*)ccmalloc(sizeof(int) * device_id_size * 2);
882
6
        schd_info[idx]._heap_signals = (schd_info[idx]._heap_streams + device_id_size);
883
6
      }
884
854
      for (i = 0; i < device_id_size; 
i++434
)
885
434
        SCHEDULE_STREAMS(schd_info[idx])[i] = -1, SCHEDULE_SIGNALS(schd_info[idx])[i] = -1;
886
420
    }
887
10.8k
    for (i = 0; i < device_id_size; 
i++5.63k
)
888
      // Go through until the end to assign streams.
889
5.63k
      if (SCHEDULE_STREAMS(schd_info[idx])[i] < 0)
890
1.36k
      {
891
1.36k
        int stream_idx = -1;
892
1.36k
        int stream_has_command = 0;
893
        // First, find a good stream in stream data (the stream is good if it can be recycled, and it has the same command).
894
        // Otherwise, we prefer a usable stream (it doesn't have the command, but it can be recycled).
895
35.0k
        for (j = 0; (stream_idx < 0 || 
!stream_has_command249
) &&
j < stream_data->rnum35.0k
;
j++33.7k
)
896
33.7k
        {
897
33.7k
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j);
898
33.7k
          if (data->device_id == device_ids[i])
899
8.94k
          {
900
8.94k
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, data->exec_idx);
901
            // If there is a path to conclude that exec_idx is before idx, then we can reuse
902
            // this stream. Otherwise the work in this "empty stream" could still be ongoing,
903
            // and we may delay the following work unnecessarily.
904
8.94k
            if (cell.i32 && 
cell.i32[0] > 0153
)
905
153
            {
906
153
              if (ccv_array_find_uint(data->command_set, node->cmd.cmd))
907
68
                stream_idx = j, stream_has_command = 1;
908
85
              else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet.
909
45
                stream_idx = j;
910
153
            }
911
8.94k
          }
912
33.7k
        }
913
1.36k
        if (stream_idx < 0)
914
1.25k
        {
915
          // Note that the max stream count is a "soft" limit. Even we have different devices, our compute allocation has to be on different streams.
916
1.25k
          if (stream_data->rnum >= max_stream_count && 
max_stream_count > 01.01k
)
917
0
          {
918
            // If we are already at out limit, go through again to see if a stream is available, if the stream has command, and also its exec_idx is not preceding this execution.
919
0
            for (j = 0; (stream_idx < 0 || !stream_has_command) && j < stream_data->rnum; j++)
920
0
            {
921
0
              ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j);
922
0
              if (data->device_id == device_ids[i])
923
0
              {
924
0
                const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, data->exec_idx, idx);
925
                // There must be no path from idx to exec_idx otherwise we already have stream_idx. Now we just to verify
926
                // there is no path from exec_idx to idx as well.
927
0
                if (!cell.i32 || cell.i32[0] == 0)
928
0
                {
929
0
                  if (ccv_array_find_uint(data->command_set, node->cmd.cmd))
930
0
                    stream_idx = j, stream_has_command = 1;
931
0
                  else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet.
932
0
                    stream_idx = j;
933
0
                }
934
0
              }
935
0
            }
936
0
            if (stream_idx >= 0)
937
0
            {
938
              // Now need to mark exec_idx is after idx, so we can avoid A -> B -> A deadlock.
939
0
              ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
940
0
              const int32_t one = 1;
941
0
              ccv_set_sparse_matrix_cell(exec_dep, idx, data->exec_idx, &one);
942
0
            }
943
0
          }
944
1.25k
          if (stream_idx < 0)
945
1.25k
          {
946
1.25k
            stream_idx = stream_data->rnum;
947
1.25k
            const ccv_nnc_stream_data_t data = {
948
1.25k
              .device_id = device_ids[i],
949
1.25k
            };
950
1.25k
            ccv_array_push(stream_data, &data);
951
1.25k
          }
952
1.25k
        }
953
1.36k
        assert(stream_idx >= 0);
954
1.36k
        ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
955
1.36k
        if (!data->command_set)
956
1.25k
          data->command_set = ccv_array_new(sizeof(uint32_t), 1, 0);
957
1.36k
        SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idx;
958
1.36k
        ccv_array_add_unique_uint(data->command_set, node->cmd.cmd);
959
        // Assign all subsequent node to use this stream.
960
1.36k
        int outgoing_idx = idx;
961
        // if we want to enforce the stream count is only 1, we certainly don't want to the greedy approach.
962
        // With the greedy approach, the current stream will go all the way down and certainly conflict with
963
        // other streams. We'd prefer to interleaving the execution instead in this case.
964
1.36k
        if (max_stream_count != 1)
965
5.63k
          
while (1.36k
outgoings[outgoing_idx] &&
outgoings[outgoing_idx]->rnum5.27k
)
966
5.21k
          {
967
5.21k
            int highest_rank = -1;
968
5.21k
            int highest_idx = -1;
969
5.21k
            int stream_n = -1;
970
5.21k
            int stream_has_command = 0;
971
12.8k
            for (j = 0; j < outgoings[outgoing_idx]->rnum; 
j++7.62k
)
972
7.62k
            {
973
7.62k
              const int d = *(int*)ccv_array_get(outgoings[outgoing_idx], j);
974
              // This is not outside of our scope at this point.
975
7.62k
              assert(schd_info[d].stream_size >= 0);
976
7.62k
              ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + d;
977
7.62k
              const int outgoing_device_id_size = _ccv_nnc_device_ids_for_stream_data(outgoing_node, device_id, stream_data, outgoing_device_ids, max_device_id_size);
978
7.62k
              if (schd_info[d].stream_size == 0)
979
4.80k
              {
980
4.80k
                schd_info[d].stream_size = outgoing_device_id_size; // At least at the same size as the device_id_size.
981
4.80k
                if (outgoing_device_id_size > 1)
982
144
                {
983
144
                  schd_info[d]._heap_streams = (int*)ccmalloc(sizeof(int) * outgoing_device_id_size * 2);
984
144
                  schd_info[d]._heap_signals = (schd_info[d]._heap_streams + outgoing_device_id_size);
985
144
                }
986
10.0k
                for (k = 0; k < outgoing_device_id_size; 
k++5.20k
)
987
5.20k
                  SCHEDULE_STREAMS(schd_info[d])[k] = -1, SCHEDULE_SIGNALS(schd_info[d])[k] = -1;
988
4.80k
              }
989
7.62k
              assert(schd_info[d].stream_size == outgoing_device_id_size);
990
16.2k
              
for (k = 0; 7.62k
k < outgoing_device_id_size;
k++8.66k
)
991
                // If it should be on the same device and the stream is not assign, potentially.
992
8.66k
                if (outgoing_device_ids[k] == device_ids[i] &&
993
8.66k
                  
SCHEDULE_STREAMS5.38k
(schd_info[d])[k] < 05.38k
&&
994
8.66k
                  
(4.91k
incomings[d].rank > highest_rank4.91k
||
995
4.91k
                   
(647
incomings[d].rank == highest_rank647
&&
996
647
                    !stream_has_command && 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
)))
997
4.27k
                {
998
4.27k
                  highest_rank = incomings[d].rank;
999
4.27k
                  highest_idx = d;
1000
4.27k
                  stream_n = k;
1001
                  // This is 1 if rank is the same (thus, I must break the tie already), if the rank is not the same, we need to compute this.
1002
4.27k
                  stream_has_command = (incomings[d].rank == highest_rank || 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
);
1003
4.27k
                }
1004
7.62k
            }
1005
5.21k
            if (highest_idx >= 0)
1006
4.27k
            {
1007
4.27k
              outgoing_idx = highest_idx;
1008
4.27k
              ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx;
1009
4.27k
              assert(stream_n >= 0);
1010
4.27k
              SCHEDULE_STREAMS(schd_info[outgoing_idx])[stream_n] = stream_idx;
1011
4.27k
              ccv_array_add_unique_uint(data->command_set, outgoing_node->cmd.cmd);
1012
4.27k
            } else
1013
941
              break;
1014
5.21k
          }
1015
1.36k
        data->exec_idx = outgoing_idx;
1016
1.36k
      }
1017
5.22k
  } ccv_nnc_graph_visit_endfor
1018
  // Go through to assign signals when necessary.
1019
5.22k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1020
5.22k
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum4.80k
)
1021
4.80k
      _ccv_nnc_graph_schedule_assign_signals(incomings[idx].outgoings, schd_info + idx, stream_data, &signal_size, schd_info, exec_info_size);
1022
5.22k
  } ccv_nnc_graph_visit_endfor
1023
7.64k
  for (i = 0; i < exec_info_size; 
i++7.24k
)
1024
7.24k
    if (outgoings[i])
1025
4.85k
      ccv_array_free(outgoings[i]);
1026
400
  ccfree(outgoings);
1027
400
  ccv_matrix_free(exec_dep);
1028
400
  ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
1029
400
  if (device_id >= 0)
1030
4
  {
1031
    // If the default stream (stream 0) is not the same as desired stream, swap with the one that is.
1032
4
    if (default_data->device_id != device_id)
1033
0
    {
1034
0
      int exchange_stream_idx = -1;
1035
      // Find the stream idx to exchange.
1036
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1037
0
        int flag = 0;
1038
0
        for(i = 0; !flag && i < schd_info[idx].stream_size; i++)
1039
0
        {
1040
0
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[i];
1041
0
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
1042
0
          if (data->device_id == device_id)
1043
0
          {
1044
0
            exchange_stream_idx = stream_idx;
1045
0
            flag = 1;
1046
0
          }
1047
0
        }
1048
0
        if (flag)
1049
0
          break;
1050
0
      } ccv_nnc_graph_visit_endfor
1051
0
      assert(exchange_stream_idx >= 0);
1052
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1053
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1054
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == 0)
1055
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = -1;
1056
0
      } ccv_nnc_graph_visit_endfor
1057
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1058
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1059
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == exchange_stream_idx)
1060
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = 0;
1061
0
      } ccv_nnc_graph_visit_endfor
1062
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1063
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1064
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == -1)
1065
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = exchange_stream_idx;
1066
0
      } ccv_nnc_graph_visit_endfor
1067
0
      ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, exchange_stream_idx))->device_id = default_data->device_id;
1068
0
      default_data->device_id = device_id;
1069
0
    }
1070
4
  }
1071
400
  int graph_stream_1_size = 0;
1072
820
  for (i = 0; i < source_size; i++)
1073
420
  {
1074
420
    const int idx = sources[i].d;
1075
    // If it has incoming nodes, check whether these are on stream 0.
1076
420
    if (incomings[idx].outgoings && incomings[idx].outgoings->rnum)
1077
0
    {
1078
0
      int flag = 0;
1079
0
      const ccv_array_t* const incoming = incomings[idx].outgoings;
1080
0
      for (j = 0; !flag && j < incoming->rnum; j++)
1081
0
      {
1082
0
        const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1083
0
        for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1084
0
          flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1085
0
      }
1086
0
      if (flag)
1087
0
        continue;
1088
0
    }
1089
854
    for (j = 0; j < schd_info[idx].stream_size; j++)
1090
434
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need an explicit begin signal to start.
1091
34
        ++graph_stream_1_size;
1092
420
  }
1093
400
  if (graph_stream_1_size > 0)
1094
14
  {
1095
14
    schedule->stream_1s = ccmalloc(sizeof(int) * graph_stream_1_size);
1096
14
    graph_stream_1_size = 0;
1097
48
    for (i = 0; i < source_size; i++)
1098
34
    {
1099
34
      const int idx = sources[i].d;
1100
      // If it has incoming nodes, check whether these are on stream 0.
1101
34
      if (incomings[idx].outgoings && incomings[idx].outgoings->rnum)
1102
0
      {
1103
0
        int flag = 0;
1104
0
        const ccv_array_t* const incoming = incomings[idx].outgoings;
1105
0
        for (j = 0; !flag && j < incoming->rnum; j++)
1106
0
        {
1107
0
          const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1108
0
          for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1109
0
            flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1110
0
        }
1111
0
        if (flag)
1112
0
          continue;
1113
0
      }
1114
82
      for (j = 0; j < schd_info[idx].stream_size; j++)
1115
48
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need an explicit begin signal to start.
1116
34
        {
1117
34
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[j];
1118
34
          int flag = 0;
1119
64
          for (k = 0; !flag && k < graph_stream_1_size; k++)
1120
30
            flag = (stream_idx == schedule->stream_1s[k]);
1121
34
          if (!flag)
1122
34
            schedule->stream_1s[graph_stream_1_size++] = stream_idx;
1123
34
        }
1124
34
    }
1125
14
    schedule->stream_1_size = graph_stream_1_size;
1126
14
  }
1127
7.64k
  for (i = 0; i < exec_info_size; i++)
1128
7.24k
    if (incomings[i].outgoings)
1129
4.80k
      ccv_array_free(incomings[i].outgoings);
1130
400
  ccfree(incomings);
1131
400
  int graph_wait_size = 0;
1132
826
  for (i = 0; i < destination_size; i++)
1133
426
  {
1134
426
    const int idx = destinations[i].d;
1135
852
    for (j = 0; j < schd_info[idx].stream_size; j++)
1136
426
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end on the default stream, we need to wait.
1137
26
        ++graph_wait_size;
1138
426
  }
1139
400
  if (graph_wait_size > 0)
1140
10
  {
1141
10
    schedule->waits = ccmalloc(sizeof(int) * graph_wait_size);
1142
10
    graph_wait_size = 0;
1143
46
    for (i = 0; i < destination_size; i++)
1144
36
    {
1145
36
      const int idx = destinations[i].d;
1146
72
      for (j = 0; j < schd_info[idx].stream_size; j++)
1147
36
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end on the default stream, we need to wait.
1148
26
        {
1149
26
          ccv_nnc_stream_data_t* const default_stream_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
1150
26
          if (SCHEDULE_SIGNALS(schd_info[idx])[j] < 0)
1151
26
            SCHEDULE_SIGNALS(schd_info[idx])[j] = signal_size++;
1152
0
          else if (default_stream_data->signal_set && ccv_array_find_int(default_stream_data->signal_set, SCHEDULE_SIGNALS(schd_info[idx])[j]))
1153
0
            continue;
1154
26
          schedule->waits[graph_wait_size++] = SCHEDULE_SIGNALS(schd_info[idx])[j];
1155
26
        }
1156
36
    }
1157
10
    schedule->wait_size = graph_wait_size;
1158
10
  }
1159
1.65k
  for (i = 0; i < stream_data->rnum; i++)
1160
1.25k
  {
1161
1.25k
    ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1162
1.25k
    if (data->signal_set)
1163
919
      ccv_array_free(data->signal_set);
1164
1.25k
    assert(data->command_set);
1165
1.25k
    ccv_array_free(data->command_set);
1166
1.25k
  }
1167
  // Allocate streams & signals
1168
400
  int default_stream_type = stream_type;
1169
400
  CCV_STREAM_SET_DEVICE_ID(default_stream_type, default_data->device_id);
1170
400
  if (root_schedule)
1171
337
  {
1172
337
    assert(!graph->streams);
1173
337
    graph->stream_size = stream_data->rnum;
1174
337
    graph->streams = (ccv_nnc_stream_context_t**)ccmalloc(sizeof(ccv_nnc_stream_context_t*) * graph->stream_size);
1175
337
    graph->block_stream_tasks = (co_routine_t**)cccalloc(graph->stream_size, sizeof(co_routine_t*));
1176
337
    if (stream_context)
1177
4
      graph->streams[0] = stream_context;
1178
1.44k
    for (i = (stream_context ? 1 : 0); i < stream_data->rnum; i++)
1179
1.10k
    {
1180
1.10k
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1181
1.10k
      int type = stream_type;
1182
1.10k
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1183
1.10k
      graph->streams[i] = ccv_nnc_stream_context_new(type);
1184
1.10k
    }
1185
337
    graph->signal_size = signal_size;
1186
337
    graph->signals = (ccv_nnc_stream_signal_t**)cccalloc(signal_size, sizeof(ccv_nnc_stream_signal_t*));
1187
3.67k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1188
7.77k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1189
4.09k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1190
1.18k
        {
1191
1.18k
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1192
1.18k
          if (!graph->signals[signal])
1193
1.18k
          {
1194
1.18k
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1195
1.18k
            int type = stream_type;
1196
1.18k
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1197
1.18k
            graph->signals[signal] = ccv_nnc_stream_signal_new(type);
1198
1.18k
          }
1199
1.18k
        }
1200
3.67k
    } ccv_nnc_graph_visit_endfor
1201
337
  } else {
1202
63
    assert(graph->streams);
1203
63
    assert(graph->stream_size >= stream_data->rnum);
1204
    // Map each stream to a properly allocated stream based on the type we need.
1205
63
    int* const stream_idxs = (int*)ccmalloc(sizeof(int) * (stream_data->rnum + signal_size));
1206
63
    uint64_t* const stream_used = (uint64_t*)cccalloc(((graph->stream_size + 63) >> 6) + ((graph->signal_size + 63) >> 6), sizeof(uint64_t));
1207
207
    for (i = 0; i < stream_data->rnum; 
i++144
)
1208
144
    {
1209
144
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1210
144
      int type = stream_type;
1211
144
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1212
476
      for (j = 0; j < graph->stream_size; 
j++332
)
1213
476
        if (!(stream_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1214
157
        {
1215
157
          const int stream_type = ccv_nnc_stream_context_type(graph->streams[j]);
1216
157
          if (stream_type == type)
1217
144
          {
1218
144
            stream_idxs[i] = j;
1219
144
            stream_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1220
144
            break;
1221
144
          }
1222
157
        }
1223
144
    }
1224
63
    assert(graph->signal_size >= signal_size);
1225
    // Map each signal to a properly allocated signal based on the type we need.
1226
63
    int* const signal_idxs = stream_idxs + stream_data->rnum;
1227
63
    uint64_t* const signal_used = stream_used + ((graph->stream_size + 63) >> 6);
1228
239
    for (i = 0; i < signal_size; i++)
1229
176
      signal_idxs[i] = -1;
1230
1.54k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1231
3.08k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1232
1.54k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1233
176
        {
1234
176
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1235
176
          if (signal_idxs[signal] < 0)
1236
176
          {
1237
176
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1238
176
            int type = stream_type;
1239
176
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1240
2.26k
            for (j = 0; j < graph->signal_size; j++)
1241
2.26k
              if (!(signal_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1242
334
              {
1243
334
                const int signal_type = ccv_nnc_stream_signal_type(graph->signals[j]);
1244
334
                if (signal_type == type)
1245
176
                {
1246
176
                  signal_idxs[signal] = j;
1247
176
                  signal_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1248
176
                  break;
1249
176
                }
1250
334
              }
1251
176
          }
1252
176
        }
1253
1.54k
    } ccv_nnc_graph_visit_endfor
1254
    // Now rebind streams and signals from the schedule.
1255
1.54k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1256
3.08k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1257
1.54k
      {
1258
1.54k
        SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idxs[SCHEDULE_STREAMS(schd_info[idx])[i]];
1259
1.54k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1260
176
          SCHEDULE_SIGNALS(schd_info[idx])[i] = signal_idxs[SCHEDULE_SIGNALS(schd_info[idx])[i]];
1261
1.54k
      }
1262
1.72k
      for (i = 0; i < schd_info[idx].wait_size; i++)
1263
182
        schd_info[idx].waits[i] = signal_idxs[schd_info[idx].waits[i]];
1264
1.54k
    } ccv_nnc_graph_visit_endfor
1265
83
    for (i = 0; i < schedule->stream_1_size; i++)
1266
20
      schedule->stream_1s[i] = stream_idxs[schedule->stream_1s[i]];
1267
89
    for (i = 0; i < schedule->wait_size; i++)
1268
26
      schedule->waits[i] = signal_idxs[schedule->waits[i]];
1269
    // Rebind which stream is stream 0 (the default stream).
1270
63
    schedule->stream_0 = stream_idxs[0];
1271
63
    ccfree(stream_used);
1272
63
    ccfree(stream_idxs);
1273
63
  }
1274
400
  assert(graph->streams);
1275
400
  ccv_nnc_graph_visit_free(visit);
1276
1.76k
  for (i = 0; i < signal_size; i++)
1277
1.36k
    { assert(graph->signals[i]); }
1278
400
  if (schedule->stream_1_size)
1279
14
    schedule->begin = ccv_nnc_stream_signal_new(default_stream_type);
1280
400
  schedule->end = ccv_nnc_stream_signal_new(default_stream_type);
1281
  // Do this recursively for its sub graphs.
1282
400
  if (graph->sub_graphs)
1283
7
    for (i = 0; i < graph->sub_graphs->rnum; i++)
1284
4
    {
1285
4
      ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i);
1286
4
      if (sub_graph && !sub_graph->default_schedule)
1287
4
      {
1288
4
        const int exec_idx = sub_graph->exec_idx - 1;
1289
4
        assert(schd_info[exec_idx].stream_size == 1);
1290
4
        const int stream_idx = SCHEDULE_STREAMS(schd_info[exec_idx])[0];
1291
4
        const int device_id = ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx))->device_id;
1292
4
        sub_graph->default_schedule = _ccv_nnc_graph_static_schedule_new(sub_graph, stream_type, device_id, max_stream_count, graph->streams[stream_idx], 0, 0, 0, 0);
1293
4
      }
1294
4
    }
1295
400
  ccv_array_free(stream_data);
1296
400
  return schedule;
1297
400
}
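The device swap above (source lines 1032–1069) renames stream labels with three passes through the schedule so the two labels never collide mid-rename. A minimal standalone sketch of that relabeling idea follows; swap_stream_labels and the flat labels array are illustrative, not part of this file:

static void swap_stream_labels(int* const labels, const int size, const int a, const int b)
{
  int i;
  for (i = 0; i < size; i++) // Pass 1: retire label a to a scratch value.
    if (labels[i] == a)
      labels[i] = -1;
  for (i = 0; i < size; i++) // Pass 2: label b takes over a.
    if (labels[i] == b)
      labels[i] = a;
  for (i = 0; i < size; i++) // Pass 3: the scratch value becomes b.
    if (labels[i] == -1)
      labels[i] = b;
}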
1298
void ccv_nnc_graph_set_default_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type, const int max_stream_count)
1299
333
{
1300
333
  assert(graph->p == 0);
1301
333
  if (graph->default_schedule)
1302
0
    ccv_nnc_graph_static_schedule_free(graph->default_schedule);
1303
333
  graph->default_schedule = _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, max_stream_count, 0, 0, 0, 0, 0);
1304
333
}
1305
1306
ccv_nnc_graph_static_schedule_t* ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int max_stream_count, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
1307
63
{
1308
63
  assert(graph->p == 0);
1309
63
  return _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, max_stream_count, 0, sources, source_size, destinations, destination_size);
1310
63
}
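A hedged usage sketch for the partial scheduler above: schedule_range is a hypothetical wrapper, the srcs/dests arrays are assumed to come from the caller's graph construction, a max_stream_count of 0 is taken here to mean "no cap", and the returned schedule would be released with ccv_nnc_graph_static_schedule_free as elsewhere in this file:

static ccv_nnc_graph_static_schedule_t* schedule_range(ccv_nnc_graph_t* const graph, const int stream_type, const ccv_nnc_graph_exec_t* const srcs, const int src_size, const ccv_nnc_graph_exec_t* const dests, const int dest_size)
{
  // Schedule only the execs reachable from srcs up to dests.
  return ccv_nnc_graph_static_schedule_new(graph, stream_type, 0, srcs, src_size, dests, dest_size);
}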
1311
1312
ccv_nnc_stream_context_t* ccv_nnc_graph_default_stream(const ccv_nnc_graph_t* const graph)
1313
9
{
1314
9
  if (graph->streams && graph->stream_size > 0)
1315
9
    return graph->streams[0];
1316
0
  return 0;
1317
9
}
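Taken together, a typical call sequence might look like the sketch below. This is an assumption-laden example: CCV_STREAM_CONTEXT_GPU and ccv_nnc_stream_context_wait are taken from the public ccv_nnc API and do not appear in this file.

static void schedule_and_sync(ccv_nnc_graph_t* const graph)
{
  // Build the default schedule once, then reuse it across runs.
  ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
  // ... run the graph against its default schedule ...
  ccv_nnc_stream_context_t* const stream = ccv_nnc_graph_default_stream(graph);
  if (stream)
    ccv_nnc_stream_context_wait(stream); // Block until stream 0 drains.
}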
1318
1319
static void _ccv_nnc_graph_dot_exec(const int index, const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_stream_context_t** const streams, const int flags, FILE* out)
1320
961
{
1321
961
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1322
959
    fputc('{', out);
1323
961
  fprintf(out, "node%d", index);
1324
961
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1325
959
  {
1326
959
    fputs("|Command: ", out);
1327
959
    fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1328
959
    if (schd_info)
1329
142
    {
1330
142
      if (schd_info->stream_size > 0)
1331
142
      {
1332
142
        int i, flag = 0;
1333
142
        fputs("|Stream: ", out);
1334
296
        for (i = 0; i < schd_info->stream_size; i++)
1335
154
        {
1336
154
          const int device_id = streams ? CCV_TENSOR_GET_DEVICE_ID(streams[SCHEDULE_STREAMS(*schd_info)[i]]->type) : 0;
1337
154
          if (i == 0)
1338
142
            fprintf(out, "%d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
1339
12
          else
1340
12
            fprintf(out, ", %d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
1341
154
        }
1342
296
        for (i = 0; i < schd_info->stream_size; 
i++154
)
1343
154
          if (SCHEDULE_SIGNALS(*schd_info)[i] >= 0)
1344
69
          {
1345
69
            if (!flag)
1346
60
            {
1347
60
              flag = 1;
1348
60
              fprintf(out, "|Signal: %d", SCHEDULE_SIGNALS(*schd_info)[i]);
1349
60
            } else
1350
9
              fprintf(out, ", %d", SCHEDULE_SIGNALS(*schd_info)[i]);
1351
69
          }
1352
142
      }
1353
142
      if (schd_info->wait_size > 0)
1354
76
      {
1355
76
        fputs("|Wait: ", out);
1356
76
        int i;
1357
116
        for (i = 0; i < schd_info->wait_size - 1; i++)
1358
40
          fprintf(out, "%d, ", schd_info->waits[i]);
1359
76
        fprintf(out, "%d", schd_info->waits[schd_info->wait_size - 1]);
1360
76
      }
1361
142
    }
1362
959
    fputc('}', out);
1363
959
  }
1364
961
}
1365
1366
static void _ccv_nnc_graph_dot_tensor(const int index, const ccv_nnc_tensor_t* const tensor, const int zone, const int flags, const int depth, FILE* out)
1367
2.67k
{
1368
  // If it has an alias pointer, or it is the long form.
1369
2.67k
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1370
2.66k
    fputc('{', out);
1371
2.67k
  const int is_tensor_view = CCV_IS_TENSOR_VIEW(tensor);
1372
2.67k
  if (is_tensor_view)
1373
51
    fprintf(out, "tensorview%d", index);
1374
2.61k
  else
1375
2.61k
    fprintf(out, "tensor%d", index);
1376
2.67k
  int i;
1377
2.86k
  for (i = 0; i < depth; i++) // Print subscripts to denote depth.
1378
195
    fputc('\'', out);
1379
2.67k
  if (CCV_GET_TAPE_ALLOC(tensor->type))
1380
9
    fputs(" (t)", out);
1381
2.67k
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1382
2.66k
  {
1383
2.66k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(tensor->info.type);
1384
2.66k
    fprintf(out, "|d%d|zone%d", device_id, zone);
1385
2.86k
    for (i = 0; i < depth; 
i++195
) // Print subscription to denote depth.
1386
195
      fputc('\'', out);
1387
2.66k
    uintptr_t aptr = (uintptr_t)tensor->data.u8;
1388
2.66k
    size_t tensor_size;
1389
2.66k
    if (is_tensor_view)
1390
51
      tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1391
2.61k
    else
1392
2.61k
      tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1393
    // Print out the range as well.
1394
2.66k
    fprintf(out, "|{%#010x|%#010x}|%d", (uint32_t)aptr, (uint32_t)(aptr + tensor_size - 1), tensor->info.dim[0]);
1395
6.76k
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
1396
4.10k
      fprintf(out, "x%d", tensor->info.dim[i]);
1397
2.66k
    fputc('}', out);
1398
2.66k
  }
1399
2.67k
}
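The size computation above (a tensor view spans stride[0] * dim[0] elements, a dense tensor spans the product of its dimensions) recurs in the recovery code below. Factored out, it would read like this sketch; tensor_footprint is a hypothetical helper, not part of this file:

static size_t tensor_footprint(const ccv_nnc_tensor_t* const tensor)
{
  if (CCV_IS_TENSOR_VIEW(tensor)) // A view's extent follows its leading stride.
    return (size_t)((ccv_nnc_tensor_view_t*)tensor)->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type);
  // A dense tensor's extent is the element count times the element size.
  return ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
}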
1400
1401
typedef struct {
1402
  int index;
1403
  int name;
1404
  int zone;
1405
  uintptr_t tensor_ref;
1406
  uintptr_t start_ptr;
1407
  uintptr_t end_ptr;
1408
} ccv_nnc_tensor_dot_t;
1409
1410
typedef struct {
1411
  ccv_nnc_tensor_dot_t* dots;
1412
  int* remap;
1413
  int* rename_zone;
1414
  int* rename_index;
1415
} ccv_nnc_tensor_dot_recovery_t;
1416
1417
// First sort by start_ptr, then by the tensor pointer (so that the same tensor is sorted into one cluster).
1418
13.3k
#define less_than(i1, i2, aux) ((i1).start_ptr < (i2).start_ptr || ((i1).start_ptr == (i2).start_ptr && (i1).tensor_ref < (i2).tensor_ref))
1419
13.3k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_dot_sort_by_ptr, ccv_nnc_tensor_dot_t, less_than)
1420
#undef less_than
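The same two-key ordering expressed against the portable qsort, for readers unfamiliar with CCV_IMPLEMENT_QSORT; this is a sketch only, since the file itself relies on the macro above:

#include <stdlib.h>

// Comparator equivalent of less_than: primary key start_ptr, tie-break on
// tensor_ref, so records of the same tensor end up adjacent.
static int tensor_dot_cmp(const void* a, const void* b)
{
  const ccv_nnc_tensor_dot_t* const d1 = (const ccv_nnc_tensor_dot_t*)a;
  const ccv_nnc_tensor_dot_t* const d2 = (const ccv_nnc_tensor_dot_t*)b;
  if (d1->start_ptr != d2->start_ptr)
    return d1->start_ptr < d2->start_ptr ? -1 : 1;
  if (d1->tensor_ref != d2->tensor_ref)
    return d1->tensor_ref < d2->tensor_ref ? -1 : 1;
  return 0;
}
// Usage: qsort(tensor_dots, tensor_count, sizeof(ccv_nnc_tensor_dot_t), tensor_dot_cmp);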
1421
1422
static int _ccv_nnc_graph_dot_tensor_multiview_count(const ccv_nnc_tensor_multiview_t* const mv)
1423
260
{
1424
260
  if (!CCV_IS_TENSOR_MULTIVIEW(mv))
1425
174
    return 1;
1426
86
  const int count = mv->kind + mv->repeat;
1427
86
  int i, c = 0;
1428
269
  for (i = 0; i < count; i++)
1429
183
    c += _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1430
86
  return c;
1431
260
}
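The recursion above simply counts multiview leaves. A standalone analogue on a generic tree follows; node_t is illustrative and is not ccv's multiview layout:

typedef struct node node_t;
struct node {
  int child_count; // 0 for a leaf.
  node_t** children;
};

// A leaf counts as 1; an interior node contributes the sum of its children.
static int leaf_count(const node_t* const node)
{
  if (node->child_count == 0)
    return 1;
  int i, c = 0;
  for (i = 0; i < node->child_count; i++)
    c += leaf_count(node->children[i]);
  return c;
}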
1432
1433
static void _ccv_nnc_graph_dot_tensor_multiview_tensor_dots(const ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_dot_t* const tensor_dots, int* tensor_index)
1434
86
{
1435
86
  const int count = mv->kind + mv->repeat;
1436
86
  int i;
1437
269
  for (i = 0; i < count; i++)
1438
183
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1439
9
      _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor_dots, tensor_index);
1440
174
    else {
1441
174
      tensor_dots[*tensor_index].name = *tensor_index;
1442
174
      tensor_dots[*tensor_index].start_ptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
1443
      // Because tv's pointer will get updated, it is not correct in this case to have one tensor_ref.
1444
174
      tensor_dots[*tensor_index].tensor_ref = tensor_dots[*tensor_index].start_ptr;
1445
174
      const size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
1446
174
      tensor_dots[*tensor_index].end_ptr = tensor_dots[*tensor_index].start_ptr + dim_size - 1;
1447
174
      ++(*tensor_index);
1448
174
    }
1449
86
}
1450
1451
static ccv_nnc_tensor_dot_recovery_t _ccv_nnc_graph_tensor_dot_recovery(const ccv_nnc_graph_t* const graph)
1452
225
{
1453
225
  int i, j;
1454
  // Recover tensor relationships for all tensors referenced in the graph.
1455
  // Most notably, we have to give these indexes, and find whether they point to
1456
  // the same memory region, and whether they overlap. This information
1457
  // was lost when we converted from the symbolic form to the execution form,
1458
  // and here we do our best to recover it, because that is easier to understand
1459
  // if we want to present the graph visually (also, we don't want to put this
1460
  // information into the tensor or execution graph to avoid overhead; thus,
1461
  // recovering is the best we can do).
1462
225
  int tensor_count = 0;
1463
1.22k
  for (i = 0; i < graph->exec_info->rnum; i++)
1464
998
  {
1465
998
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1466
3.00k
    for (j = 0; j < exec_info->input_size; j++)
1467
2.01k
      if (exec_info->inputs[j])
1468
1.62k
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->inputs[j]) : 1;
1469
2.17k
    for (j = 0; j < exec_info->output_size; j++)
1470
1.17k
      if (exec_info->outputs[j])
1471
1.12k
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->outputs[j]) : 1;
1472
998
  }
1473
225
  ccv_nnc_tensor_dot_t* tensor_dots = tensor_count > 0 ? (ccv_nnc_tensor_dot_t*)ccmalloc(sizeof(ccv_nnc_tensor_dot_t) * tensor_count) : 0;
1474
225
  int k = 0;
1475
1.22k
  for (i = 0; i < graph->exec_info->rnum; i++)
1476
998
  {
1477
998
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1478
3.00k
    for (j = 0; j < exec_info->input_size; j++)
1479
2.01k
    {
1480
2.01k
      ccv_nnc_tensor_t* tensor = exec_info->inputs[j];
1481
2.01k
      if (!tensor)
1482
391
        continue;
1483
1.62k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
1484
36
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
1485
1.58k
      else {
1486
1.58k
        tensor_dots[k].name = k;
1487
1.58k
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
1488
1.58k
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
1489
1.58k
        size_t tensor_size;
1490
1.58k
        if (CCV_IS_TENSOR_VIEW(tensor))
1491
29
          tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1492
1.55k
        else
1493
1.55k
          tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1494
1.58k
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + tensor_size - 1;
1495
1.58k
        ++k;
1496
1.58k
      }
1497
1.62k
    }
1498
2.17k
    for (j = 0; j < exec_info->output_size; j++)
1499
1.17k
    {
1500
1.17k
      ccv_nnc_tensor_t* tensor = exec_info->outputs[j];
1501
1.17k
      if (!tensor)
1502
47
        continue;
1503
1.12k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
1504
41
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
1505
1.08k
      else {
1506
1.08k
        tensor_dots[k].name = k;
1507
1.08k
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
1508
1.08k
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
1509
1.08k
        size_t tensor_size;
1510
1.08k
        if (CCV_IS_TENSOR_VIEW(tensor))
1511
22
          tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1512
1.06k
        else
1513
1.06k
          tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1514
1.08k
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + tensor_size - 1;
1515
1.08k
        ++k;
1516
1.08k
      }
1517
1.12k
    }
1518
998
  }
1519
225
  tensor_count = k; // We may over count, now shrink.
1520
  // To group overlap memory into one zone, we sort it by start ptr first (secondary by the tensor pointer).
1521
225
  _ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0);
1522
225
  int index = 0, zone = 0;
1523
225
  uintptr_t tensor_ref = tensor_count > 0 ? tensor_dots[0].tensor_ref : 0;
1524
225
  uintptr_t end_ptr = tensor_count > 0 ? tensor_dots[0].end_ptr : 0;
1525
  // Then it is trivial: we go by the end ptr. If the next start ptr is still within the end ptr (start ptr <= end ptr),
1526
  // they are the same zone.
1527
3.06k
  for (i = 0; i < tensor_count; i++)
1528
2.84k
  {
1529
2.84k
    if (tensor_dots[i].tensor_ref != tensor_ref)
1530
1.20k
    {
1531
1.20k
      tensor_ref = tensor_dots[i].tensor_ref;
1532
1.20k
      ++index;
1533
1.20k
    }
1534
2.84k
    if (tensor_dots[i].start_ptr > end_ptr)
1535
864
    {
1536
864
      end_ptr = ccv_max(end_ptr, tensor_dots[i].end_ptr);
1537
864
      ++zone;
1538
864
    }
1539
2.84k
    tensor_dots[i].index = index;
1540
2.84k
    tensor_dots[i].zone = zone;
1541
2.84k
  }
1542
  // We already have index and zone assigned, but the problem is that these are not very human-interpretable (they
1543
  // follow the pointers from low to high, not the tensor creation order). The following code renames both the index
1544
  // and the zone so that they are much easier to understand.
1545
225
  const int index_count = index + 1;
1546
225
  const int zone_count = zone + 1;
1547
225
  int* remap = (int*)ccmalloc(sizeof(int) * (tensor_count + index_count + zone_count));
1548
225
  int* rename_index = remap + tensor_count;
1549
225
  int* rename_zone = rename_index + index_count;
1550
3.06k
  for (i = 0; i < tensor_count; i++)
1551
2.84k
    remap[tensor_dots[i].name] = i;
1552
1.65k
  for (i = 0; i < index_count; i++)
1553
1.42k
    rename_index[i] = -1;
1554
1.31k
  for (i = 0; i < zone_count; i++)
1555
1.08k
    rename_zone[i] = -1;
1556
225
  index = 0;
1557
225
  zone = 0;
1558
3.06k
  for (i = 0; i < tensor_count; i++)
1559
2.84k
  {
1560
2.84k
    ccv_nnc_tensor_dot_t* tensor_dot = tensor_dots + remap[i];
1561
2.84k
    if (rename_index[tensor_dot->index] == -1)
1562
1.42k
      rename_index[tensor_dot->index] = index++;
1563
2.84k
    if (rename_zone[tensor_dot->zone] == -1)
1564
1.08k
      rename_zone[tensor_dot->zone] = zone++;
1565
2.84k
  }
1566
225
  ccv_nnc_tensor_dot_recovery_t recovery = {
1567
225
    .dots = tensor_dots,
1568
225
    .remap = remap,
1569
225
    .rename_index = rename_index,
1570
225
    .rename_zone = rename_zone,
1571
225
  };
1572
225
  return recovery;
1573
225
}
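To make the zone assignment concrete: after sorting, a new zone starts only when a start_ptr lies strictly past the running end_ptr. A worked standalone sketch on three hypothetical ranges (prints zones 0, 0, 1):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
  // Sorted ranges: the second overlaps the first, the third is disjoint.
  const uintptr_t start[3] = { 0x1000, 0x1080, 0x2000 };
  const uintptr_t end[3] = { 0x10ff, 0x11ff, 0x20ff };
  uintptr_t end_ptr = end[0];
  int i, zone = 0;
  for (i = 0; i < 3; i++)
  {
    if (start[i] > end_ptr) // Disjoint from everything seen so far: new zone.
    {
      end_ptr = end[i] > end_ptr ? end[i] : end_ptr;
      ++zone;
    }
    printf("range %d -> zone %d\n", i, zone);
  }
  return 0;
}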
1574
1575
static void _ccv_nnc_graph_tensor_dot_recovery_free(const ccv_nnc_tensor_dot_recovery_t recovery)
1576
225
{
1577
225
  ccfree(recovery.dots);
1578
225
  ccfree(recovery.remap);
1579
225
}
1580
1581
static void _ccv_nnc_graph_dot_tensor_multiview_one(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int depth, int* tensor_index, FILE* out)
1582
86
{
1583
86
  const int count = mv->kind + mv->repeat;
1584
86
  int i, j;
1585
86
  fputs("|{", out);
1586
269
  for (i = 0; i < count; i++)
1587
183
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1588
9
    {
1589
9
      fprintf(out, "{%d", i);
1590
9
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
1591
9
        fputc('*', out); // Denotes that we loop on this.
1592
9
      _ccv_nnc_graph_dot_tensor_multiview_one((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], recovery, depth, tensor_index, out);
1593
9
      if (i == count - 1)
1594
7
        fputc('}', out);
1595
2
      else
1596
2
        fputs("}|", out);
1597
174
    } else {
1598
174
      fprintf(out, "{%d", i);
1599
174
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
1600
163
        fputc('*', out); // Denotes that we loop on this.
1601
174
      const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
1602
174
      fprintf(out, "|zone%d", recovery.rename_zone[tensor_dot->zone]);
1603
368
      for (j = 0; j < depth; 
j++194
)
1604
194
        fputc('\'', out);
1605
174
      uintptr_t aptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
1606
      // For the last one, we don't extend to full ainc.
1607
174
      size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
1608
      // Print out the range as well.
1609
174
      fprintf(out, "|{%#010x|%#010x}", (uint32_t)aptr, (uint32_t)(aptr + dim_size - 1));
1610
174
      ++(*tensor_index);
1611
174
      if (i == count - 1)
1612
79
        fputc('}', out);
1613
95
      else
1614
95
        fputs("}|", out);
1615
174
    }
1616
86
  fputc('}', out);
1617
86
}
1618
1619
static void _ccv_nnc_graph_dot_tensor_multiview(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, int* tensor_index, FILE* out)
1620
77
{
1621
  // If it has an alias pointer, or it is the long form.
1622
77
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1623
77
    fputc('{', out);
1624
77
  const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
1625
77
  fprintf(out, "multiview%d", recovery.rename_index[tensor_dot->index]);
1626
77
  int i;
1627
161
  for (i = 0; i < depth; i++) // Print subscripts to denote depth.
1628
84
    fputc('\'', out);
1629
77
  if (CCV_GET_TAPE_ALLOC(mv->type))
1630
7
    fputs(" (t)", out);
1631
77
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1632
77
  {
1633
77
    _ccv_nnc_graph_dot_tensor_multiview_one(mv, recovery, depth, tensor_index, out);
1634
77
    const ccv_nnc_tensor_t* root = (ccv_nnc_tensor_t*)mv;
1635
156
    while (CCV_IS_TENSOR_MULTIVIEW(root))
1636
79
      root = CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)root)[0];
1637
77
    fprintf(out, "|%d", root->info.dim[0]);
1638
105
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && root->info.dim[i]; i++)
1639
28
      fprintf(out, "x%d", root->info.dim[i]);
1640
77
    fputc('}', out);
1641
77
  } else
1642
0
    *tensor_index += _ccv_nnc_graph_dot_tensor_multiview_count(mv);
1643
77
}
1644
1645
static void _ccv_nnc_graph_dot_node(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int exec_index, ccv_nnc_stream_context_t** const streams, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* const tensor_index)
1646
961
{
1647
961
  fprintf(out, "node%d [shape=record,label=\"", exec_index);
1648
961
  _ccv_nnc_graph_dot_exec(exec_index, exec_info, schd_info, streams, flags, out);
1649
961
  int i;
1650
961
  int k = *tensor_index;
1651
961
  if (exec_info->input_size > 0)
1652
837
  {
1653
837
    fputs("|{Input", out);
1654
2.81k
    for (i = 0; i < exec_info->input_size; i++)
1655
1.97k
      if (exec_info->inputs[i])
1656
1.58k
      {
1657
1.58k
        fputc('|', out);
1658
1.58k
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1659
33
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1660
1.55k
        else {
1661
1.55k
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1662
1.55k
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1663
1.55k
          ++k;
1664
1.55k
        }
1665
1.58k
      } else
1666
391
        fputs("|-", out);
1667
837
    fputc('}', out);
1668
837
  }
1669
961
  if (exec_info->output_size > 0)
1670
900
  {
1671
900
    fputs("|{Output", out);
1672
2.03k
    for (i = 0; i < exec_info->output_size; i++)
1673
1.13k
      if (exec_info->outputs[i])
1674
1.09k
      {
1675
1.09k
        fputc('|', out);
1676
1.09k
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1677
30
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1678
1.06k
        else {
1679
1.06k
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1680
1.06k
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1681
1.06k
          ++k;
1682
1.06k
        }
1683
1.09k
      } else
1684
47
        fputs("|-", out);
1685
900
    fputc('}', out);
1686
900
  }
1687
961
  fputs("\"];\n", out);
1688
961
  *tensor_index = k;
1689
961
}
1690
1691
static void _ccv_nnc_graph_dot_while_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const ccv_nnc_graph_t* const while_graph, const int flags, const int depth, FILE* out, int* tensor_index)
1692
25
{
1693
25
  int i;
1694
25
  fprintf(out, "label=<<b>while%d </b>Command: ", exec_index);
1695
25
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1696
25
  fputs(">;\n", out);
1697
25
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
1698
25
  int k = *tensor_index;
1699
25
  if (exec_info->input_size > 0)
1700
16
  {
1701
16
    fputs("{Input|{", out);
1702
39
    for (i = 0; i < exec_info->input_size; 
i++23
)
1703
23
    {
1704
23
      if (i > 0)
1705
7
        fputc('|', out);
1706
23
      if (exec_info->inputs[i])
1707
23
      {
1708
23
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1709
1
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1710
22
        else {
1711
22
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1712
22
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1713
22
          ++k;
1714
22
        }
1715
23
      } else
1716
0
        fputc('-', out);
1717
23
    }
1718
16
    fputs("}}", out);
1719
16
  }
1720
25
  if (exec_info->output_size > 0)
1721
15
  {
1722
15
    if (exec_info->input_size > 0)
1723
12
      fputs("|", out);
1724
15
    fputs("{Output|{", out);
1725
38
    for (i = 0; i < exec_info->output_size; 
i++23
)
1726
23
    {
1727
23
      if (i > 0)
1728
8
        fputc('|', out);
1729
23
      if (exec_info->outputs[i])
1730
23
      {
1731
23
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1732
0
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1733
23
        else {
1734
23
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1735
23
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1736
23
          ++k;
1737
23
        }
1738
23
      } else
1739
0
        fputc('-', out);
1740
23
    }
1741
15
    fputs("}}", out);
1742
15
  }
1743
25
  fputs("}\"];\n", out);
1744
25
  *tensor_index = k;
1745
25
}
1746
1747
static void _ccv_nnc_graph_dot_case_of_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* tensor_index)
1748
12
{
1749
12
  int i;
1750
12
  fprintf(out, "label=<<b>caseof%d </b>Command: ", exec_index);
1751
12
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1752
12
  fputs(">;\n", out);
1753
12
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
1754
12
  int k = *tensor_index;
1755
12
  if (exec_info->input_size > 0)
1756
11
  {
1757
11
    fputs("{Input|{", out);
1758
22
    for (i = 0; i < exec_info->input_size; 
i++11
)
1759
11
    {
1760
11
      if (i > 0)
1761
0
        fputc('|', out);
1762
11
      if (exec_info->inputs[i])
1763
11
      {
1764
11
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1765
2
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1766
9
        else {
1767
9
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1768
9
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1769
9
          ++k;
1770
9
        }
1771
11
      } else
1772
0
        fputc('-', out);
1773
11
    }
1774
11
    fputs("}}", out);
1775
11
  }
1776
12
  if (exec_info->output_size > 0)
1777
11
  {
1778
11
    if (exec_info->input_size > 0)
1779
10
      fputs("|", out);
1780
11
    fputs("{Output|{", out);
1781
24
    for (i = 0; i < exec_info->output_size; 
i++13
)
1782
13
    {
1783
13
      if (i > 0)
1784
2
        fputc('|', out);
1785
13
      if (exec_info->outputs[i])
1786
13
      {
1787
13
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1788
11
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1789
2
        else {
1790
2
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1791
2
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1792
2
          ++k;
1793
2
        }
1794
13
      } else
1795
0
        fputc('-', out);
1796
13
    }
1797
11
    fputs("}}", out);
1798
11
  }
1799
12
  fputs("}\"];\n", out);
1800
12
  *tensor_index = k;
1801
12
}
1802
1803
static void _ccv_nnc_graph_dot_sub_graphs(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_tensor_dot_recovery_t p_recovery, const ccv_array_t* const sub_graphs, const int flags, const int depth, FILE* out, int* tensor_index, int* exec_index)
1804
37
{
1805
37
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1806
25
  {
1807
25
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
1808
25
    const ccv_nnc_graph_t* const while_graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[0] - 1);
1809
    // Output this node info within this subgraph.
1810
25
    _ccv_nnc_graph_dot_while_label(exec_info, *exec_index, p_recovery, while_graph, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
1811
25
  } else if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
1812
12
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
1813
12
    _ccv_nnc_graph_dot_case_of_label(exec_info, *exec_index, p_recovery, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
1814
12
  }
1815
37
  ++(*exec_index);
1816
37
  int p;
1817
94
  for (p = 0; p < exec_info->graph_ref_size; p++)
1818
57
  {
1819
57
    if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1820
32
    {
1821
32
      fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\nlabel=\"\"\n", *exec_index, *exec_index);
1822
32
      ++(*exec_index);
1823
32
    }
1824
57
    const ccv_nnc_graph_t* const graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[p] - 1);
1825
57
    const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
1826
57
    ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
1827
57
    int i, j;
1828
57
    int k = 0;
1829
57
    int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
1830
    // Output styles.
1831
167
    for (i = 0; i < graph->exec_info->rnum; i++)
1832
110
    {
1833
110
      node_id[i] = *exec_index;
1834
110
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1835
110
      if (CCV_NNC_GRAPH_REF(exec_info)[0])
1836
3
        _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, depth + 1, out, &k, exec_index);
1837
107
      else {
1838
107
        _ccv_nnc_graph_dot_node(exec_info,
1839
107
          schedule ? (i < schedule->exec_info_size ? schedule->exec_info + i : 0) : 0,
1840
107
          *exec_index, graph->streams, recovery, flags, depth, out, &k);
1841
107
        ++(*exec_index);
1842
107
      }
1843
110
    }
1844
    // Output connections.
1845
167
    for (i = 0; i < graph->exec_info->rnum; i++)
1846
110
    {
1847
110
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1848
110
      if (exec_info->outgoings)
1849
108
        for (j = 0; j < exec_info->outgoings->rnum; j++)
1850
55
        {
1851
55
          const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
1852
55
          const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
1853
          // If both are sub-graphs, have both tail and head specified.
1854
55
          if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1855
0
            fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
1856
55
          else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
1857
1
            fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
1858
54
          else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1859
3
            fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
1860
51
          else
1861
51
            fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
1862
55
        }
1863
110
    }
1864
57
    fputs("}\n", out);
1865
57
    _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
1866
57
    ccfree(node_id);
1867
57
  }
1868
  // Extra subgraph cluster.
1869
37
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1870
12
    fputs("}\n", out);
1871
37
}
1872
1873
void ccv_nnc_graph_dot(const ccv_nnc_graph_t* const graph, const int flags, FILE* out)
1874
168
{
1875
168
  fputs("digraph G {\ncompound=true;\n", out);
1876
168
  ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
1877
168
  int i, j;
1878
168
  int k = 0, c = 0;
1879
168
  int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
1880
168
  const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
1881
  // Output styles.
1882
1.05k
  for (i = 0; i < graph->exec_info->rnum; i++)
1883
888
  {
1884
888
    node_id[i] = c;
1885
888
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1886
888
    if (CCV_NNC_GRAPH_REF(exec_info)[0])
1887
34
      _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, 1, out, &k, &c);
1888
854
    else {
1889
854
      _ccv_nnc_graph_dot_node(exec_info,
1890
854
        schedule ? (i < schedule->exec_info_size ? schedule->exec_info + i : 0) : 0,
1891
854
        c, graph->streams, recovery, flags, 0, out, &k);
1892
854
      ++c;
1893
854
    }
1894
888
  }
1895
  // Output connections.
1896
1.05k
  for (i = 0; i < graph->exec_info->rnum; i++)
1897
888
  {
1898
888
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1899
888
    if (exec_info->outgoings)
1900
1.80k
      for (j = 0; j < exec_info->outgoings->rnum; j++)
1901
1.08k
      {
1902
1.08k
        const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
1903
1.08k
        const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
1904
        // If both are sub-graphs, have both tail and head specified.
1905
1.08k
        if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1906
3
          fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
1907
1.08k
        else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
1908
15
          fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
1909
1.06k
        else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1910
8
          fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
1911
1.06k
        else
1912
1.06k
          fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
1913
1.08k
      }
1914
888
  }
1915
168
  fputs("}\n", out);
1916
168
  _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
1917
168
  ccfree(node_id);
1918
168
}
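A hedged usage sketch for the writer above; the render step assumes the Graphviz dot binary is available on the system:

#include <stdio.h>

// Dump the long form and render offline with: dot -Tpng graph.dot -o graph.png
static void dump_graph(const ccv_nnc_graph_t* const graph)
{
  FILE* const out = fopen("graph.dot", "w");
  if (!out)
    return;
  ccv_nnc_graph_dot(graph, CCV_NNC_LONG_DOT_GRAPH, out);
  fclose(out);
}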
1919
1920
void ccv_nnc_graph_autotune(ccv_nnc_graph_t* const graph, const size_t max_workspace_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
1921
120
{
1922
  // Execute the current node; for synchronous CPU execution, there is no stream unit.
1923
120
  int i;
1924
120
#define visitor(node, idx, ...) \
1925
2.68k
  do { \
1926
2.68k
    if (node->cmd.cmd == CCV_NNC_NOOP) \
1927
2.68k
      continue; \
1928
2.68k
    if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD) \
1929
2.58k
      for (i = 0; i < node->graph_ref_size; i++) \
1930
18
      { \
1931
18
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[i] - 1); \
1932
18
        ccv_nnc_graph_autotune(sub_graph, max_workspace_size, flags, 0, 0, 0, 0); \
1933
18
      } \
1934
2.58k
    else { \
1935
      /* Need to unwrap these tensors */ \
1936
15.2k
      for (i = 0; i < node->input_size + node->output_size; i++) \
1937
12.7k
        if (node->inputs[i] && CCV_IS_TENSOR_MULTIVIEW(node->inputs[i])) \
1938
12.7k
          node->inputs[i] = _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)node->inputs[i]); \
1939
2.57k
      PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); \
1940
10.8k
      for (i = 0; i < node->input_size; i++) \
1941
8.23k
      { \
1942
8.23k
        PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p)", i + 1, node->inputs[i], (node->inputs[i] ? node->inputs[i]->data.u8 : 0)); \
1943
8.23k
        if (node->inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE)) \
1944
8.23k
          ccv_nnc_print_tensor_shape(node->inputs[i]); \
1945
8.23k
        PRINT(CCV_CLI_VERBOSE, "\n"); \
1946
8.23k
      } \
1947
7.03k
      for (i = 0; i < node->output_size; i++) \
1948
4.46k
      { \
1949
4.46k
        PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p)", i + 1, node->outputs[i], (node->outputs[i] ? node->outputs[i]->data.u8 : 0)); \
1950
4.46k
        if (node->outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE)) \
1951
4.46k
          ccv_nnc_print_tensor_shape(node->outputs[i]); \
1952
4.46k
        PRINT(CCV_CLI_VERBOSE, "\n"); \
1953
4.46k
      } \
1954
2.57k
      node->cmd = ccv_nnc_cmd_autotune(node->cmd, max_workspace_size, node->hint, flags, node->inputs, node->input_size, node->outputs, node->output_size, 0); \
1955
2.57k
    } \
1956
2.68k
  } while (0)
1957
120
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0);
1958
120
  const int graph_source_size = source_size ? source_size : (graph->sources ? graph->sources->rnum : 0);
1959
120
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0);
1960
120
  const int graph_destination_size = destination_size ? destination_size : (graph->destinations ? graph->destinations->rnum : 0);
1961
2.68k
  CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1962
120
#undef visitor
1963
120
}
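Autotune is typically invoked once, after the graph is finalized and before steady-state runs; as the fallback logic above shows, passing null sources and destinations defers to the graph's own. A sketch with an arbitrary, illustrative workspace bound:

static void tune(ccv_nnc_graph_t* const graph)
{
  // Allow up to 512MiB of scratch workspace per command while probing backends.
  const size_t max_workspace_size = (size_t)512 * 1024 * 1024;
  ccv_nnc_graph_autotune(graph, max_workspace_size, 0, 0, 0, 0, 0);
}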
1964
1965
void ccv_nnc_graph_free(ccv_nnc_graph_t* const graph)
1966
6.24k
{
1967
6.24k
  int i, j;
1968
38.7k
  for (i = 0; i < graph->exec_info->rnum; i++)
1969
32.4k
  {
1970
32.4k
    ccv_nnc_graph_exec_info_t *info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1971
32.4k
    if (info->_heap_graph_ref)
1972
8
      ccfree(info->_heap_graph_ref);
1973
32.4k
    ccv_array_t* outgoings = info->outgoings;
1974
32.4k
    if (outgoings)
1975
26.2k
      ccv_array_free(outgoings);
1976
    // We allocate inputs & outputs in a contiguous fashion; therefore, we only need to free the input array.
1977
32.4k
    if (info->inputs)
1978
32.2k
      ccfree(info->inputs);
1979
32.4k
    if (info->input_flags)
1980
32.0k
      ccfree(info->input_flags);
1981
32.4k
    if (info->updates)
1982
17
      ccfree(info->updates);
1983
32.4k
    if ((info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) && info->p_while.inputs)
1984
23
      ccfree(info->p_while.inputs);
1985
32.4k
  }
1986
6.24k
  if (graph->tensor_wraps)
1987
27
  {
1988
80
    for (i = 0; i < graph->tensor_wraps->rnum; i++)
1989
53
    {
1990
53
      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, i);
1991
53
      if (tensor_wrap_array)
1992
52
      {
1993
195
        for (j = 0; j < tensor_wrap_array->size; j++)
1994
143
          _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[j]);
1995
52
        ccfree(tensor_wrap_array);
1996
52
      }
1997
53
    }
1998
27
    ccv_array_free(graph->tensor_wraps);
1999
27
  }
2000
6.24k
  if (graph->tensor_wraps_refs)
2001
44
    ccv_array_free(graph->tensor_wraps_refs);
2002
6.24k
  if (graph->breakpoints)
2003
26
    ccfree(graph->breakpoints);
2004
6.24k
  if (graph->sources)
2005
6.23k
    ccv_array_free(graph->sources);
2006
6.24k
  if (graph->destinations)
2007
6.23k
    ccv_array_free(graph->destinations);
2008
6.24k
  if (graph->default_schedule)
2009
337
    ccv_nnc_graph_static_schedule_free(graph->default_schedule);
2010
6.24k
  if (graph->streams)
2011
337
  {
2012
    // If the graph has a parent graph, the default stream was allocated by the parent graph, so we need to skip it.
2013
337
    if (!graph->p)
2014
333
      ccv_nnc_stream_context_free(graph->streams[0]);
2015
1.11k
    for (i = 1; i < graph->stream_size; i++)
2016
773
      ccv_nnc_stream_context_free(graph->streams[i]);
2017
337
    ccfree(graph->streams);
2018
337
  }
2019
6.24k
  if (graph->block_stream_tasks)
2020
337
    ccfree(graph->block_stream_tasks);
2021
6.24k
  if (graph->signals)
2022
337
  {
2023
1.52k
    for (i = 0; i < graph->signal_size; i++)
2024
1.18k
      ccv_nnc_stream_signal_free(graph->signals[i]);
2025
337
    ccfree(graph->signals);
2026
337
  }
2027
6.24k
  if (graph->carry_overs)
2028
21
  {
2029
46
    for (i = 0; i < graph->carry_overs->rnum; i++)
2030
25
    {
2031
25
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
2032
25
      _ccv_nnc_graph_tensor_wrap_free(carry_over->from);
2033
25
      _ccv_nnc_graph_tensor_wrap_free(carry_over->to);
2034
25
    }
2035
21
    ccv_array_free(graph->carry_overs);
2036
21
  }
2037
6.24k
  if (graph->sub_graphs)
2038
35
  {
2039
94
    for (i = 0; i < graph->sub_graphs->rnum; i++)
2040
59
      ccv_nnc_graph_free(*(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i));
2041
35
    ccv_array_free(graph->sub_graphs);
2042
35
  }
2043
6.24k
  ccv_array_free(graph->exec_info);
2044
6.24k
  if (graph->buffer)
2045
335
    ccfree(graph->buffer);
2046
6.24k
  ccfree(graph);
2047
6.24k
}
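End to end, the ownership rules this destructor enforces (sub-graphs freed recursively, a parent-owned stream 0 skipped) reduce to a simple lifecycle for callers; a minimal sketch:

static void lifecycle(void)
{
  ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
  // ... add execs, set sources and destinations, schedule, run ...
  ccv_nnc_graph_free(graph); // Releases execs, schedules, streams, signals, sub-graphs.
}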