Coverage Report

Created: 2021-04-12 03:25

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_graph.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_nnc_graph.h"
6
7
// MARK - Level-2 API
8
9
ccv_nnc_graph_t* ccv_nnc_graph_new(void)
10
6.10k
{
11
6.10k
  ccv_nnc_graph_t* graph = (ccv_nnc_graph_t*)cccalloc(1, sizeof(ccv_nnc_graph_t));
12
6.10k
  graph->exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), 5, 0);
13
6.10k
  return graph;
14
6.10k
}
15
16
void ccv_nnc_graph_set_sources(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const sources, const int source_size)
{
	// Lazily create the source list, or reset it when one already exists.
	if (graph->sources)
		ccv_array_clear(graph->sources);
	else
		graph->sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), source_size, 0);
	int i;
	for (i = 0; i < source_size; i++)
		ccv_array_push(graph->sources, sources + i);
	// Changing the sources invalidates any previous topological order.
	graph->topsorted = 0;
}
27
28
ccv_nnc_graph_exec_t* ccv_nnc_graph_sources(const ccv_nnc_graph_t* const graph)
29
0
{
30
0
  return graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0;
31
0
}
32
33
int ccv_nnc_graph_source_size(const ccv_nnc_graph_t* const graph)
34
0
{
35
0
  return graph->sources ? graph->sources->rnum : 0;
36
0
}
37
38
void ccv_nnc_graph_set_destinations(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
	// Lazily create the destination list, or reset it when one already exists.
	if (!graph->destinations)
		graph->destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), destination_size, 0);
	else
		// Fixed: previously cleared graph->sources here (copy-paste from
		// ccv_nnc_graph_set_sources), which both wiped the source list and
		// appended new destinations onto the stale ones.
		ccv_array_clear(graph->destinations);
	int i;
	for (i = 0; i < destination_size; i++)
		ccv_array_push(graph->destinations, destinations + i);
	// Changing the destinations invalidates any previous topological order.
	graph->topsorted = 0;
}
49
50
ccv_nnc_graph_exec_t* ccv_nnc_graph_destinations(const ccv_nnc_graph_t* const graph)
51
0
{
52
0
  return graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0;
53
0
}
54
55
int ccv_nnc_graph_destination_size(const ccv_nnc_graph_t* const graph)
56
0
{
57
0
  return graph->destinations ? graph->destinations->rnum : 0;
58
0
}
59
60
void ccv_nnc_graph_exec_set(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_cmd_t cmd)
{
	// Replace the command of an existing exec node in this graph.
	assert(exec.graph == graph);
	assert(exec.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	info->cmd = cmd;
}
67
68
void ccv_nnc_graph_exec_set_hint(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_hint_t hint)
{
	// Replace the execution hint of an existing exec node in this graph.
	assert(exec.graph == graph);
	assert(exec.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	info->hint = hint;
}
75
76
static int _ccv_nnc_tensor_multiview_level_count(const ccv_nnc_tensor_multiview_t* const mv)
77
482
{
78
482
  if (!CCV_IS_TENSOR_MULTIVIEW(mv))
79
482
    
return 1327
;
80
155
  const int count = mv->kind + mv->repeat;
81
155
  int i, c = 0;
82
502
  for (i = 0; i < count; 
i++347
)
83
347
  {
84
347
    ccv_nnc_tensor_t* tv = CCV_NNC_MULTIVIEW_DATA(mv)[i];
85
347
    if (tv == CCV_NNC_TENSOR_PLACEHOLDER)
86
347
      
c = 8
ccv_max8
(c, 1);
87
347
    else
88
347
      
c = 339
ccv_max339
(c, _ccv_nnc_tensor_multiview_level_count((ccv_nnc_tensor_multiview_t*)tv));
89
347
  }
90
155
  return c + 1;
91
155
}
92
93
static ccv_nnc_graph_tensor_wrap_t* _ccv_nnc_graph_tensor_wrap_new(const ccv_nnc_tensor_multiview_t* const mv)
94
143
{
95
143
  const int level_count = _ccv_nnc_tensor_multiview_level_count(mv);
96
143
  ccv_nnc_graph_tensor_wrap_t* tensor_wrap = (ccv_nnc_graph_tensor_wrap_t*)ccmalloc(sizeof(ccv_nnc_graph_tensor_wrap_t) + sizeof(ccv_nnc_tensor_t*) * (level_count - 1));
97
143
  tensor_wrap->update_required = 0;
98
143
  tensor_wrap->count = level_count;
99
143
  tensor_wrap->index = 0;
100
143
  tensor_wrap->tensors[0] = (ccv_nnc_tensor_t*)mv;
101
143
  return tensor_wrap;
102
143
}
103
104
static void _ccv_nnc_graph_exec_rewind(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
{
	// Nothing to rewind when this exec holds no wrapped (multi-view) tensors.
	if (!info->tensor_wraps_ref)
		return;
	assert(info->tensor_wraps_ref <= graph->tensor_wraps->rnum);
	ccv_nnc_graph_tensor_wrap_array_t* const wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
	// Restore inputs / outputs / updates back to their root (unwrapped) tensors.
	// The wrap array lays out inputs first, then outputs, then updates.
	int i;
	for (i = 0; i < info->input_size; i++)
		if (wrap_array->tensor_wraps[i])
			info->inputs[i] = wrap_array->tensor_wraps[i]->tensors[0];
	const int out_off = info->input_size;
	for (i = 0; i < info->output_size; i++)
		if (wrap_array->tensor_wraps[out_off + i])
			info->outputs[i] = wrap_array->tensor_wraps[out_off + i]->tensors[0];
	const int upd_off = info->input_size + info->output_size;
	for (i = 0; i < info->update_size; i++)
		if (wrap_array->tensor_wraps[upd_off + i])
			info->updates[i] = wrap_array->tensor_wraps[upd_off + i]->tensors[0];
}
124
125
static void _ccv_nnc_graph_tensor_wrap_free(ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
{
	// Wraps are a single allocation (flexible trailing array), so one free suffices.
	ccfree(tensor_wrap);
}
129
130
ccv_nnc_graph_tensor_wrap_array_t* ccv_nnc_get_tensor_wrap_array(ccv_nnc_graph_t* const graph, const int tensor_wrap_size, int* const tensor_wraps_ref)
{
	// Resolve the existing slot for this exec (ref is 1-based; 0 means "none yet").
	ccv_nnc_graph_tensor_wrap_array_t** slot = *tensor_wraps_ref ? (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, *tensor_wraps_ref - 1) : 0;
	if (!slot)
	{
		// No slot assigned yet: append an empty one and hand its index back to the caller.
		if (!graph->tensor_wraps)
			graph->tensor_wraps = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wrap_array_t*), 0, 0);
		ccv_nnc_graph_tensor_wrap_array_t* const empty = 0;
		ccv_array_push(graph->tensor_wraps, &empty);
		slot = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, graph->tensor_wraps->rnum - 1);
		*tensor_wraps_ref = graph->tensor_wraps->rnum;
	}
	int i;
	if (*slot)
	{
		// Resize an existing array; zero-fill any newly exposed wrap pointers.
		if ((*slot)->size != tensor_wrap_size)
			*slot = (ccv_nnc_graph_tensor_wrap_array_t*)ccrealloc(*slot, sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1));
		for (i = (*slot)->size; i < tensor_wrap_size; i++)
			(*slot)->tensor_wraps[i] = 0;
	} else
		// Fresh array: cccalloc zero-fills every wrap pointer.
		*slot = (ccv_nnc_graph_tensor_wrap_array_t*)cccalloc(sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1), 1);
	ccv_nnc_graph_tensor_wrap_array_t* const wrap_array = *slot;
	wrap_array->size = tensor_wrap_size;
	return wrap_array;
}
156
157
void ccv_nnc_set_tensor_wraps(ccv_nnc_graph_tensor_wrap_t** const tensor_wraps, ccv_nnc_tensor_t* const* const tensors, const int tensor_size)
{
	// Synchronize each wrap slot with its tensor: multi-view tensors (other than
	// PHI-anchored ones) get a wrap; everything else gets its wrap released.
	int i;
	for (i = 0; i < tensor_size; i++)
	{
		ccv_nnc_tensor_t* const tensor = tensors[i];
		if (!tensor)
			continue;
		const int needs_wrap = CCV_IS_TENSOR_MULTIVIEW(tensor) &&
			((ccv_nnc_tensor_multiview_t*)tensor)->anchor != CCV_NNC_MULTIVIEW_PHI;
		if (needs_wrap)
		{
			// Keep an up-to-date wrap; rebuild only when the root tensor changed.
			if (tensor_wraps[i] && tensor == tensor_wraps[i]->tensors[0])
				continue;
			if (tensor_wraps[i])
				_ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
			tensor_wraps[i] = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)tensor);
		} else {
			if (tensor_wraps[i])
				_ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
			tensor_wraps[i] = 0;
		}
	}
}
179
180
void ccv_nnc_graph_register_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
{
	const ccv_nnc_graph_tensor_wraps_ref_t tensor_wraps_ref = {
		.d = tensor_wraps_ref_d,
		.graph = graph,
	};
	// Record the reference on this graph and every ancestor graph (walking p->p),
	// so the wrap can be found from any enclosing level.
	ccv_nnc_graph_t* p = graph;
	do {
		if (!p->tensor_wraps_refs)
		{
			p->tensor_wraps_refs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wraps_ref_t), 0, 0);
			ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
		} else {
			// De-duplicate: only push when this (d, graph) pair is not yet present.
			int i, found = 0;
			for (i = 0; !found && i < p->tensor_wraps_refs->rnum; i++)
			{
				ccv_nnc_graph_tensor_wraps_ref_t* const ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
				found = (ref->d == tensor_wraps_ref_d && ref->graph == graph);
			}
			if (!found)
				ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
		}
		p = p->p;
	} while (p);
}
206
207
static void _ccv_nnc_graph_redo_tensor_wraps(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
{
	int i;
	// Does any of the exec's tensors require unwrapping at run time?
	const int has_wrap = ccv_nnc_tensors_have_wraps(info->inputs, info->input_size) ||
		ccv_nnc_tensors_have_wraps(info->outputs, info->output_size) ||
		ccv_nnc_tensors_have_wraps(info->updates, info->update_size);
	if (has_wrap)
	{
		// (Re)build the wrap array: inputs, then outputs, then updates.
		const int total = info->input_size + info->output_size + info->update_size;
		ccv_nnc_graph_tensor_wrap_array_t* const wrap_array = ccv_nnc_get_tensor_wrap_array(graph, total, &info->tensor_wraps_ref);
		ccv_nnc_set_tensor_wraps(wrap_array->tensor_wraps, info->inputs, info->input_size);
		const int out_off = info->input_size;
		ccv_nnc_set_tensor_wraps(wrap_array->tensor_wraps + out_off, info->outputs, info->output_size);
		const int upd_off = info->input_size + info->output_size;
		ccv_nnc_set_tensor_wraps(wrap_array->tensor_wraps + upd_off, info->updates, info->update_size);
	} else if (info->tensor_wraps_ref) {
		// No wrapped tensors any more: release the stale wrap array and clear the ref.
		ccv_nnc_graph_tensor_wrap_array_t** const slot = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
		ccv_nnc_graph_tensor_wrap_array_t* const wrap_array = *slot;
		if (wrap_array)
		{
			for (i = 0; i < wrap_array->size; i++)
				if (wrap_array->tensor_wraps[i])
					_ccv_nnc_graph_tensor_wrap_free(wrap_array->tensor_wraps[i]);
			ccfree(wrap_array);
			*slot = 0;
			info->tensor_wraps_ref = 0;
		}
	}
}
236
237
static void _ccv_nnc_graph_deregister_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
{
	// Remove the (d, graph) reference from this graph and every ancestor level.
	ccv_nnc_graph_t* p = graph;
	do {
		int i;
		if (p->tensor_wraps_refs)
			for (i = 0; i < p->tensor_wraps_refs->rnum; i++)
			{
				ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
				if (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph)
				{
					--p->tensor_wraps_refs->rnum;
					if (i < p->tensor_wraps_refs->rnum)
						// Fixed: the shift copies overlapping regions (src = dst + 1), so
						// memcpy was undefined behavior — memmove is required. Also size by
						// the array's actual element type (previously sized with the
						// unrelated ccv_nnc_graph_exec_t, which only worked by layout luck).
						memmove(tensor_wraps_ref, tensor_wraps_ref + 1, sizeof(ccv_nnc_graph_tensor_wraps_ref_t) * (p->tensor_wraps_refs->rnum - i));
					break;
				}
			}
		p = p->p;
	} while (p);
}
258
259
void ccv_nnc_graph_exec_set_io_flags(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const int* const input_flags, const int input_flag_size, const int* const output_flags, const int output_flag_size)
{
	assert(exec.graph == graph);
	assert(exec.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	assert(input_flag_size <= info->input_size);
	assert(output_flag_size <= info->output_size);
	// No tensors at all: nothing to flag.
	if (info->input_size + info->output_size == 0)
		return;
	if (!info->input_flags)
	{
		// Input and output flags share one zeroed allocation; outputs follow inputs.
		info->input_flags = (int*)cccalloc(info->input_size + info->output_size, sizeof(int));
		info->output_flags = info->input_flags + info->input_size;
	}
	if (input_flag_size > 0)
		memcpy(info->input_flags, input_flags, sizeof(int) * input_flag_size);
	if (output_flag_size > 0)
		memcpy(info->output_flags, output_flags, sizeof(int) * output_flag_size);
}
278
279
void ccv_nnc_graph_exec_pair_with(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_graph_exec_t pair_exec)
{
	assert(exec.graph == graph);
	assert(exec.d >= 0);
	assert(exec.d < graph->exec_info->rnum);
	// The pair may live in this graph or in its paired graph.
	assert(pair_exec.graph == graph || pair_exec.graph == graph->pair);
	assert(pair_exec.d >= 0);
	if (pair_exec.graph == graph)
		{ assert(pair_exec.d < graph->exec_info->rnum); }
	else
		{ assert(pair_exec.d < graph->pair->exec_info->rnum); }
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	// Stored 1-based so 0 can mean "no pair".
	info->pair_ref = pair_exec.d + 1;
}
293
294
static ccv_nnc_tensor_t* _ccv_nnc_any_tensor_from_tensor_multiview(ccv_nnc_tensor_multiview_t* const mv)
{
	// Descend through nested multi-views until a concrete tensor is reached.
	ccv_nnc_tensor_t* tensor = (ccv_nnc_tensor_t*)mv;
	while (CCV_IS_TENSOR_MULTIVIEW(tensor))
	{
		ccv_nnc_tensor_multiview_t* const sub = (ccv_nnc_tensor_multiview_t*)tensor;
		const int count = 0;
		const int off = sub->kind;
		const int mod = sub->repeat;
		// Pick the view for iteration `count` (0 here): indices below `off` are the
		// fixed leading views; past that, views repeat with period `mod`.
		const int view = count >= off ? ((count - off) % mod) + off : count;
		tensor = CCV_NNC_MULTIVIEW_DATA(sub)[view]; // Unwrap.
	}
	return tensor;
}
308
309
void ccv_nnc_graph_exec_set_io(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
	assert(exec.graph == graph);
	assert(exec.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	// De-register from the graph first if it currently holds multi-view tensors.
	if (info->tensor_wraps_ref)
		_ccv_nnc_graph_deregister_tensor_wraps(graph, info->tensor_wraps_ref - 1);
	// In case it was already executed, restore tensors to their root views.
	_ccv_nnc_graph_exec_rewind(info, graph);
	if (input_size == 0 && output_size == 0)
	{
		// Clearing all i/o: drop the shared inputs/outputs allocation.
		if (info->input_size > 0 || info->output_size > 0)
			ccfree(info->inputs);
		info->inputs = 0;
		info->outputs = 0;
		info->input_size = 0;
		info->output_size = 0;
		_ccv_nnc_graph_redo_tensor_wraps(info, graph);
		if (info->tensor_wraps_ref)
			ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
		return;
	}
	// Inputs and outputs share one allocation; outputs point just past the inputs.
	info->inputs = info->inputs
		? (ccv_nnc_tensor_t**)ccrealloc(info->inputs, sizeof(ccv_nnc_tensor_t*) * (input_size + output_size))
		: (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
	info->outputs = info->inputs + input_size;
	if (inputs)
		memcpy(info->inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
	if (outputs)
		memcpy(info->outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
	// Re-derive the backend from the union of memory type / format / datatype over
	// all tensors (resolving any multi-view to a concrete tensor first).
	int i;
	int mem = 0, fmt = 0, dt = 0;
	for (i = 0; i < input_size + output_size; i++)
		if (info->inputs[i])
		{
			ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info->inputs[i])
				? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info->inputs[i])
				: info->inputs[i];
			mem |= CCV_TENSOR_GET_MEMORY(tensor->info.type), fmt |= tensor->info.format, dt |= tensor->info.datatype;
		}
	info->cmd.backend = ccv_nnc_cmd_find_backend(info->cmd, mem, fmt, dt);
	info->input_size = input_size;
	info->output_size = output_size;
	_ccv_nnc_graph_redo_tensor_wraps(info, graph);
	// Register again if multi-view wraps exist with the new i/o.
	if (info->tensor_wraps_ref)
		ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
	// Old flags no longer line up with the new i/o layout; drop them.
	if (info->input_flags)
	{
		ccfree(info->input_flags);
		info->input_flags = info->output_flags = 0;
	}
}
363
364
void ccv_nnc_graph_exec_add_as_affected(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const update)
{
	// Only multi-view tensors can be tracked as "affected" updates.
	assert(CCV_IS_TENSOR_MULTIVIEW(update));
	assert(exec.graph == graph);
	assert(exec.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	// Remember whether a wrap ref existed before; if not, we register afterwards.
	const int register_tensor_wraps = !info->tensor_wraps_ref;
	const int update_index = info->update_size;
	++info->update_size;
	info->updates = info->updates
		? (ccv_nnc_tensor_t**)ccrealloc(info->updates, sizeof(ccv_nnc_tensor_t*) * info->update_size)
		: (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * info->update_size);
	info->updates[update_index] = update;
	_ccv_nnc_graph_redo_tensor_wraps(info, graph);
	if (register_tensor_wraps)
		ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
}
382
383
ccv_nnc_graph_exec_t ccv_nnc_graph_exec_new(ccv_nnc_graph_t* const graph, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
	const int d = graph->exec_info->rnum;
	ccv_nnc_graph_exec_info_t info = {
		.cmd = cmd,
		.hint = hint,
		.input_size = input_size,
		.output_size = output_size,
	};
	assert(inputs || input_size == 0);
	assert(outputs || output_size == 0);
	if (input_size > 0 || output_size > 0)
	{
		// Inputs and outputs share one allocation; outputs point just past the inputs.
		info.inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
		info.outputs = info.inputs + input_size;
		if (inputs)
			memcpy(info.inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
		if (outputs)
			memcpy(info.outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
		info.input_size = input_size;
		info.output_size = output_size;
		// Pick a backend from the union of memory type / format / datatype over all
		// tensors (resolving any multi-view to a concrete tensor first).
		int i;
		int mem = 0, fmt = 0, dt = 0;
		for (i = 0; i < input_size + output_size; i++)
			if (info.inputs[i])
			{
				ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info.inputs[i])
					? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info.inputs[i])
					: info.inputs[i];
				mem |= CCV_TENSOR_GET_MEMORY(tensor->info.type), fmt |= tensor->info.format, dt |= tensor->info.datatype;
			}
		info.cmd.backend = ccv_nnc_cmd_find_backend(info.cmd, mem, fmt, dt);
	}
	_ccv_nnc_graph_redo_tensor_wraps(&info, graph);
	// Add itself to the graph's wraps array; this helps run time unwrapping.
	if (info.tensor_wraps_ref)
		ccv_nnc_graph_register_tensor_wraps(graph, info.tensor_wraps_ref - 1);
	ccv_array_push(graph->exec_info, &info);
	return (ccv_nnc_graph_exec_t){
		.d = d,
		.graph = graph,
	};
}
424
425
void ccv_nnc_graph_add_carry_over(ccv_nnc_graph_t* const graph, const ccv_nnc_tensor_t* const from, const ccv_nnc_tensor_t* const to)
{
	// A carry-over wraps both endpoints so values can roll between iterations.
	ccv_nnc_graph_tensor_carry_over_t carry_over = {
		.from = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)from),
		.to = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)to)
	};
	if (!graph->carry_overs)
		graph->carry_overs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_carry_over_t), 0, 0);
	ccv_array_push(graph->carry_overs, &carry_over);
}
435
436
int ccv_nnc_graph_exec_concat(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
{
	assert(graph == source.graph);
	assert(graph == destination.graph);
	assert(source.d < graph->exec_info->rnum);
	assert(destination.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
	if (!src_info->outgoings)
		src_info->outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
	else {
		// Edge already present? Report -1 and leave the graph unchanged.
		int i;
		for (i = 0; i < src_info->outgoings->rnum; i++)
			if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
				return -1;
	}
	ccv_array_push(src_info->outgoings, &destination.d);
	// A new edge invalidates any previous topological order.
	graph->topsorted = 0;
	return 0;
}
456
457
int ccv_nnc_graph_exec_disjoin(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
{
	assert(graph == source.graph);
	assert(graph == destination.graph);
	assert(source.d < graph->exec_info->rnum);
	assert(destination.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* const src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
	if (!src_info->outgoings)
		return -1;
	// Locate the edge to remove; -1 when it does not exist.
	int i;
	int found = -1;
	for (i = 0; found < 0 && i < src_info->outgoings->rnum; i++)
		if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
			found = i;
	if (found < 0)
		return -1;
	// Swap-remove: overwrite with the last element, then shrink the array.
	if (found < src_info->outgoings->rnum - 1)
		*(int*)ccv_array_get(src_info->outgoings, found) = *(int*)ccv_array_get(src_info->outgoings, src_info->outgoings->rnum - 1);
	--src_info->outgoings->rnum;
	// Inherit the destination's outgoing edges so downstream reachability is kept.
	ccv_nnc_graph_exec_info_t* const dest_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, destination.d);
	if (dest_info->outgoings)
		for (i = 0; i < dest_info->outgoings->rnum; i++)
			ccv_array_add_unique_int(src_info->outgoings, *(int*)ccv_array_get(dest_info->outgoings, i));
	graph->topsorted = 0;
	return 0;
}
486
487
int ccv_nnc_graph_exec_count(const ccv_nnc_graph_t* const graph)
488
0
{
489
0
  return graph->exec_info ? graph->exec_info->rnum : 0;
490
0
}
491
492
void* ccv_nnc_graph_buffer(ccv_nnc_graph_t* const graph, int size)
{
	// Grow-only scratch buffer attached to the graph; reused across requests.
	if (graph->buffer_size < size)
	{
		graph->buffer_size = size;
		graph->buffer = graph->buffer ? ccrealloc(graph->buffer, size) : ccmalloc(size);
	}
	return graph->buffer;
}
500
501
// Topologically sorts the graph's exec nodes in place. On return, graph->exec_info
// holds the nodes in topological order, exec_cvt[old_index] gives each node's new
// index, and all source/destination/outgoing references are remapped accordingly.
void ccv_nnc_graph_topsort(ccv_nnc_graph_t* const graph, int* const exec_cvt, const int exec_cvt_size)
{
	assert(exec_cvt_size == graph->exec_info->rnum);
	assert(graph->sources && graph->sources->rnum);
	assert(graph->destinations && graph->destinations->rnum);
	int i, j;
	// -1 marks "not yet assigned a new index".
	for (i = 0; i < exec_cvt_size; i++)
		exec_cvt[i] = -1;
	ccv_array_t* exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), graph->exec_info->rnum, 0);
	// If there are breakpoints, it is more complicated, we first start to the breakpoints, and then continue from the breakpoints to the destinations.
	if (graph->breakpoint_size)
	{
		// Pass 1: visit from the sources up to (and including) the breakpoints.
		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, graph->breakpoints, graph->breakpoint_size, 0);
		for (i = 0; i < graph->breakpoint_size; i++)
			exec_cvt[graph->breakpoints[i].d] = -2; // Mark this as breakpoints, so we will skip the first round.
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			if (exec_cvt[idx] == -2) // Skip breakpoint.
				continue;
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				// CCV_NNC_GRAPH_REF entries are 1-based; 0 means no sub-graph.
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
		// Everything pushed so far precedes the breakpoints.
		graph->breakpoint_offset = exec_info->rnum;
		// Pass 2: visit from the breakpoints down to the destinations.
		visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
		for (i = 0; i < graph->breakpoint_size; i++)
			{ assert(exec_cvt[graph->breakpoints[i].d] >= 0); } // All breakpoints should be assigned.
	} else {
		// No breakpoints: a single visit from sources to destinations.
		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
	}
	// Every node must have been visited exactly once.
	assert(graph->exec_info->rnum == exec_info->rnum);
	// Swap in the sorted array.
	ccv_array_free(graph->exec_info);
	graph->exec_info = exec_info;
	// Remap sources and destinations to their new indices.
	for (i = 0; i < graph->sources->rnum; i++)
	{
		ccv_nnc_graph_exec_t* const source = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, i);
		source->d = exec_cvt[source->d];
	}
	for (i = 0; i < graph->destinations->rnum; i++)
	{
		ccv_nnc_graph_exec_t* const destination = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, i);
		destination->d = exec_cvt[destination->d];
	}
	// Update all outgoings to reflect the latest.
	for (i = 0; i < exec_info->rnum; i++)
	{
		ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(exec_info, i);
		if (info->outgoings)
			for (j = 0; j < info->outgoings->rnum; j++)
				*(int*)ccv_array_get(info->outgoings, j) = exec_cvt[*(int*)ccv_array_get(info->outgoings, j)];
	}
	graph->topsorted = 1;
}
599
600
// Per-stream bookkeeping used when scheduling graph execs onto streams.
typedef struct {
	int device_id;
	int exec_idx;
	// Signals already waited on by this stream (avoids redundant waits).
	ccv_array_t* signal_set;
	ccv_array_t* command_set; // The set of command executed in this stream. In case there is a tie (on rank). We will check this.
} ccv_nnc_stream_data_t;
606
607
static void _ccv_nnc_graph_schedule_assign_signals(ccv_array_t* const incoming, ccv_nnc_graph_exec_schedule_t* const node, ccv_array_t* const stream_data, int* const signal_size, ccv_nnc_graph_exec_schedule_t* const exec_info, const int exec_info_size)
608
4.82k
{
609
4.82k
  assert(incoming->rnum > 0);
610
4.82k
  int i, j, k;
611
4.82k
  int wait_size = 0, max_wait_size = 0;
612
10.8k
  for (i = 0; i < incoming->rnum; 
i++5.98k
)
613
5.98k
  {
614
5.98k
    const int incoming_idx = *(int*)ccv_array_get(incoming, i);
615
5.98k
    ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
616
5.98k
    assert(incoming_exec_info->stream_size > 0);
617
5.98k
    max_wait_size += incoming_exec_info->stream_size;
618
5.98k
  }
619
4.82k
  int waits[ccv_max(1, max_wait_size)];
620
4.82k
  assert(node->stream_size > 0);
621
10.8k
  
for (i = 0; 4.82k
i < incoming->rnum;
i++5.98k
)
622
5.98k
  {
623
5.98k
    const int incoming_idx = *(int*)ccv_array_get(incoming, i);
624
5.98k
    assert(incoming_idx < exec_info_size);
625
5.98k
    assert(incoming_idx >= 0);
626
5.98k
    ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
627
5.98k
    assert(incoming_exec_info->stream_size > 0);
628
5.98k
    int stream_synced = 1;
629
5.98k
    // If the current node's stream is a subset of the incoming node's stream, there
630
5.98k
    // is no need to sync with signal, because we are already synced with the incoming.
631
11.9k
    for (j = 0; stream_synced && 
j < node->stream_size10.2k
;
j++5.99k
)
632
5.99k
    {
633
5.99k
      const int s = SCHEDULE_STREAMS(*node)[j];
634
5.99k
      assert(s >= 0);
635
5.99k
      int flag = 0;
636
12.8k
      for (k = 0; !flag && 
k < incoming_exec_info->stream_size8.52k
;
k++6.82k
)
637
6.82k
        flag = (SCHEDULE_STREAMS(*incoming_exec_info)[k] == s);
638
5.99k
      stream_synced = flag;
639
5.99k
    }
640
5.98k
    if (stream_synced)
641
4.28k
      continue;
642
1.70k
    // Otherwise, find the streams we need to sync with, and create signals for these.
643
3.42k
    
for (j = 0; 1.70k
j < incoming_exec_info->stream_size;
j++1.71k
)
644
1.71k
    {
645
1.71k
      const int s = SCHEDULE_STREAMS(*incoming_exec_info)[j];
646
1.71k
      assert(s >= 0);
647
1.71k
      int flag = 0;
648
4.43k
      for (k = 0; !flag && 
k < node->stream_size4.41k
;
k++2.72k
)
649
2.72k
        flag = (SCHEDULE_STREAMS(*node)[k] == s);
650
1.71k
      if (!flag) // Need to have a signal.
651
1.69k
      {
652
1.69k
        if (SCHEDULE_SIGNALS(*incoming_exec_info)[j] < 0)
653
1.69k
          
SCHEDULE_SIGNALS1.28k
(*incoming_exec_info)[j] = (*signal_size)++1.28k
;
654
405
        else {
655
405
          int flag = 0;
656
405
          // If any of the stream the current node has already seen this signal, we are good already.
657
1.36k
          for (k = 0; !flag && k < node->stream_size; 
k++955
)
658
955
          {
659
955
            assert(SCHEDULE_STREAMS(*node)[k] >= 0);
660
955
            ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
661
955
            flag = (data->signal_set && 
ccv_array_find_int(data->signal_set, 429
SCHEDULE_SIGNALS429
(*incoming_exec_info)[j]));
662
955
          }
663
405
          if (flag)
664
0
            continue;
665
1.69k
        }
666
1.69k
        // Otherwise, we need to wait for this. Currently, our granularity is about wait on all streams.
667
1.69k
        waits[wait_size++] = SCHEDULE_SIGNALS(*incoming_exec_info)[j];
668
1.69k
        // All streams on this node have seen this signal.
669
4.36k
        for (k = 0; k < node->stream_size; 
k++2.66k
)
670
2.66k
        {
671
2.66k
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
672
2.66k
          if (!data->signal_set)
673
896
            data->signal_set = ccv_array_new(sizeof(int), 0, 0);
674
2.66k
          ccv_array_push(data->signal_set, &SCHEDULE_SIGNALS(*incoming_exec_info)[j]);
675
2.66k
        }
676
1.69k
      }
677
1.71k
    }
678
1.70k
  }
679
4.82k
  node->wait_size = wait_size;
680
4.82k
  if (wait_size > 0)
681
801
  {
682
801
    node->waits = node->waits ? 
ccrealloc0
(node->waits, sizeof(int) * wait_size)0
: ccmalloc(sizeof(int) * wait_size);
683
801
    memcpy(node->waits, waits, sizeof(int) * wait_size);
684
801
  }
685
4.82k
}
686
687
typedef struct {
688
  int rank;
689
  ccv_array_t* outgoings;
690
} ccv_nnc_incoming_t;
691
692
static int _ccv_nnc_device_ids_for_stream_data(ccv_nnc_graph_exec_info_t* const node, const int device_id, ccv_array_t* const stream_data, int* const device_ids, const int max_device_id_size)
693
12.8k
{
694
12.8k
  // TODO: I need to re-think whether this is GPU only or not.
695
12.8k
  int device_id_size = ccv_nnc_device_ids_for_io(node->inputs, node->input_size, node->outputs, node->output_size, CCV_TENSOR_GPU_MEMORY, device_ids, max_device_id_size);
696
12.8k
  if (device_id_size == 0)
697
2.06k
  {
698
2.06k
    // If there is a default data, use that device id. Otherwise, use the device id passed in (this will be the default data device id).
699
2.06k
    if (stream_data->rnum > 0)
700
1.93k
    {
701
1.93k
      ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
702
1.93k
      device_ids[0] = default_data->device_id;
703
1.93k
    } else
704
137
      device_ids[0] = device_id >= 0 ? 
device_id2
:
0135
;
705
2.06k
    device_id_size = 1;
706
2.06k
  }
707
12.8k
  return device_id_size;
708
12.8k
}
709
710
void ccv_nnc_graph_static_schedule_free(ccv_nnc_graph_static_schedule_t* const schedule)
711
355
{
712
355
  int i;
713
355
  ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
714
7.52k
  for (i = 0; i < schedule->exec_info_size; 
i++7.17k
)
715
7.17k
  {
716
7.17k
    if (schd_info[i].stream_size > 1)
717
7.17k
      
ccfree150
(schd_info[i]._heap_streams)150
;
718
7.17k
    if (schd_info[i].waits)
719
7.17k
      
ccfree801
(schd_info[i].waits)801
;
720
7.17k
  }
721
355
  if (schedule->stream_1s)
722
355
    
ccfree14
(schedule->stream_1s)14
;
723
355
  if (schedule->waits)
724
355
    
ccfree10
(schedule->waits)10
;
725
355
  if (schedule->psort)
726
355
    
ccfree54
(schedule->psort)54
;
727
355
  if (schedule->begin)
728
14
    ccv_nnc_stream_signal_free(schedule->begin);
729
355
  if (schedule->end)
730
355
    ccv_nnc_stream_signal_free(schedule->end);
731
355
  ccfree(schedule);
732
355
}
733
734
static ccv_nnc_graph_static_schedule_t* _ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int device_id, ccv_nnc_stream_context_t* const stream_context, const ccv_nnc_graph_exec_t* const _sources, const int _source_size, const ccv_nnc_graph_exec_t* const _destinations, const int _destination_size)
735
355
{
736
355
  assert(graph->sources && graph->sources->rnum);
737
355
  assert(graph->destinations && graph->destinations->rnum);
738
355
  assert(graph->topsorted); // Only support this on a topsorted graph.
739
355
  const int exec_info_size = graph->exec_info->rnum;
740
355
  assert(exec_info_size > 0);
741
355
  const ccv_nnc_graph_exec_t* const sources = _sources == 0 ? 
(ccv_nnc_graph_exec_t*)330
ccv_array_get330
(graph->sources, 0) :
_sources25
;
742
355
  const int source_size = _sources == 0 ? 
graph->sources->rnum330
:
_source_size25
;
743
355
  if (!_sources)
744
330
    { assert(_source_size == 0); }
745
355
  const ccv_nnc_graph_exec_t* const destinations = _destinations == 0 ? 
(ccv_nnc_graph_exec_t*)322
ccv_array_get322
(graph->destinations, 0) :
_destinations33
;
746
355
  const int destination_size = _destinations == 0 ? 
graph->destinations->rnum322
:
_destination_size33
;
747
355
  if (!_destinations)
748
322
    { assert(_destination_size == 0); }
749
355
  const int root_schedule = (_sources == 0 && 
_destinations == 0330
);
750
355
  ccv_nnc_graph_static_schedule_t* const schedule = cccalloc(1, sizeof(ccv_nnc_graph_static_schedule_t) + sizeof(ccv_nnc_graph_exec_schedule_t) * (exec_info_size - 1));
751
355
  schedule->exec_info_size = exec_info_size;
752
355
  ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
753
355
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0);
754
710
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new355
(graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
755
710
  if (
!root_schedule355
)
756
54
  {
757
54
    // If this is not a root schedule, we need to do partial topsort.
758
54
    int psort_size = 0;
759
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
760
1.51k
      ++psort_size;
761
1.51k
    } ccv_nnc_graph_visit_endfor
762
54
    schedule->psort = (int*)ccmalloc(sizeof(int) * psort_size);
763
54
    schedule->psort_size = psort_size;
764
54
    psort_size = 0;
765
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
766
1.51k
      schedule->psort[psort_size++] = idx;
767
1.51k
    } ccv_nnc_graph_visit_endfor
768
54
  }
769
710
  int i, j, k;
770
710
  // Generate exec dependencies (or, in other words, partial ordering of executions).
771
710
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(exec_info_size, exec_info_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
772
710
  int* buf = (int*)
ccmalloc355
(sizeof(int) * exec_info_size * 2);
773
710
  int buf_size;
774
710
#define for_block(x, val) \
775
178k
  do { \
776
178k
    if (((int32_t*)val)[0] > 0) \
777
178k
    { \
778
178k
      buf[buf_size * 2] = x; \
779
178k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
780
178k
      ++buf_size; \
781
178k
    } \
782
178k
  } while (0)
783
7.52k
  for (i = 0; i < exec_info_size; 
i++7.17k
)
784
7.17k
    schd_info[i].stream_size = -1;
785
5.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx, term) {
786
5.19k
    buf_size = 0; /* save all its parent deps to this buffer */
787
5.19k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
788
5.19k
    schd_info[idx].stream_size = 0;
789
5.19k
    if (vector)
790
178k
      
CCV_SPARSE_VECTOR_FOREACH4.82k
(exec_dep, vector, for_block);
791
5.19k
    if (!node->outgoings)
792
322
      continue;
793
14.0k
    
for (i = 0; 4.87k
i < node->outgoings->rnum;
i++9.15k
)
794
9.15k
    {
795
9.15k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
796
9.15k
      const int32_t one = 1;
797
9.15k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
798
9.15k
      /* If not found, set, if the current node is the destination node, no need
799
9.15k
       * set itself as parent of subsequent nodes because its terminal nature. */
800
9.15k
      if (!term && 
(9.08k
!cell.i329.08k
||
cell.i32[0] == 00
))
801
9.08k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
802
312k
      for (j = 0; j < buf_size; 
j++302k
) /* set with all idx's dependencies as well */
803
302k
      {
804
302k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
805
302k
        /* If not found, set */
806
302k
        if (!cell.i32 || 
cell.i32[0] == 0116k
)
807
186k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
808
116k
        else {
809
116k
          /* Otherwise, set to the longest one */
810
116k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
811
116k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
812
116k
        }
813
302k
      }
814
9.15k
    }
815
4.87k
  } ccv_nnc_graph_visit_endfor
816
710
#undef for_block
817
710
  
ccfree355
(buf);
818
355
  // Algorithm to allocate signals and streams for this graph.
819
355
  ccv_array_t* const stream_data = ccv_array_new(sizeof(ccv_nnc_stream_data_t), 0, 0);
820
355
  ccv_array_t** const outgoings = cccalloc(exec_info_size, sizeof(ccv_array_t*));
821
355
  ccv_nnc_incoming_t* const incomings = cccalloc(exec_info_size, sizeof(ccv_nnc_incoming_t));
822
355
  int max_device_id_size = 1;
823
355
  // Filter out outgoing nodes that we will be able to access it afterwards anyway.
824
5.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
825
5.19k
    max_device_id_size = ccv_max(node->input_size + node->output_size, max_device_id_size);
826
5.19k
    if (node->outgoings)
827
4.87k
    {
828
4.87k
      outgoings[idx] = ccv_array_new(sizeof(int), 0, 0);
829
14.0k
      for (i = 0; i < node->outgoings->rnum; 
i++9.15k
)
830
9.15k
      {
831
9.15k
        const int di = *(int*)ccv_array_get(node->outgoings, i);
832
9.15k
        // Skip if we haven't accessed this exec.
833
9.15k
        if (schd_info[di].stream_size < 0)
834
1.32k
          continue;
835
7.82k
        int flag = 0;
836
26.1k
        for (j = 0; !flag && 
j < node->outgoings->rnum24.2k
;
j++18.3k
)
837
18.3k
        {
838
18.3k
          if (j != i)
839
12.2k
          {
840
12.2k
            const int dj = *(int*)ccv_array_get(node->outgoings, j);
841
12.2k
            ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, di, dj);
842
12.2k
            flag = (cell.i32 && 
cell.i32[0]1.84k
);
843
12.2k
          }
844
18.3k
        }
845
7.82k
        if (!flag)
846
5.98k
        {
847
5.98k
          ccv_array_push(outgoings[idx], &di);
848
5.98k
          if (!incomings[di].outgoings)
849
4.82k
            incomings[di].outgoings = ccv_array_new(sizeof(int), 1, 0);
850
5.98k
          ccv_array_push(incomings[di].outgoings, &idx);
851
5.98k
        }
852
7.82k
      }
853
4.87k
    }
854
5.19k
  } ccv_nnc_graph_visit_endfor
855
355
#define visitor(node, idx, _) \
856
5.19k
  if (node->outgoings) \
857
10.8k
    
for (i = 0; 4.82k
i < node->outgoings->rnum;
i++5.98k
) \
858
5.98k
    { \
859
5.98k
      const int d = *(int*)ccv_array_get(node->outgoings, i); \
860
5.98k
      node->rank = ccv_max(incomings[d].rank + 1, node->rank); \
861
5.98k
    }
862
5.19k
  
CCV_NNC_GRAPH_VISIT355
(graph, incomings, exec_info_size, destinations, destination_size, sources, source_size, 0, visitor);
863
355
#undef visitor
864
355
  int device_ids[max_device_id_size];
865
355
  int outgoing_device_ids[max_device_id_size];
866
355
  int signal_size = 0;
867
5.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
868
5.19k
    // Go through the incomings.
869
5.19k
    const int device_id_size = _ccv_nnc_device_ids_for_stream_data(node, device_id, stream_data, device_ids, max_device_id_size);
870
5.19k
    if (schd_info[idx].stream_size == 0)
871
375
    {
872
375
      schd_info[idx].stream_size = device_id_size; // At least at the same size as the device_id_size.
873
375
      if (device_id_size > 1)
874
6
      {
875
6
        schd_info[idx]._heap_streams = (int*)ccmalloc(sizeof(int) * device_id_size * 2);
876
6
        schd_info[idx]._heap_signals = (schd_info[idx]._heap_streams + device_id_size);
877
6
      }
878
764
      for (i = 0; i < device_id_size; 
i++389
)
879
389
        SCHEDULE_STREAMS(schd_info[idx])[i] = -1, SCHEDULE_SIGNALS(schd_info[idx])[i] = -1;
880
375
    }
881
10.8k
    for (i = 0; i < device_id_size; 
i++5.61k
)
882
5.61k
      // Go through until the end to assign streams.
883
5.61k
      if (SCHEDULE_STREAMS(schd_info[idx])[i] < 0)
884
1.30k
      {
885
1.30k
        int stream_idx = -1;
886
1.30k
        int stream_has_command = 0;
887
1.30k
        // First, find a good stream in stream data (the stream is good if it can be recycled, and it has the same command).
888
1.30k
        // Otherwise, we prefer a usable stream (it doesn't have the command, but it can be recycled).
889
35.2k
        for (j = 0; (stream_idx < 0 || 
!stream_has_command254
) &&
j < stream_data->rnum35.1k
;
j++33.9k
)
890
33.9k
        {
891
33.9k
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j);
892
33.9k
          if (data->device_id == device_ids[i])
893
8.93k
          {
894
8.93k
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, data->exec_idx);
895
8.93k
            // If there is a path to conclude that exec_idx is before idx, then we can reuse
896
8.93k
            // this stream. Otherwise the work in this "empty stream" could still be ongoing,
897
8.93k
            // and we may delay the following work unnecessarily.
898
8.93k
            if (cell.i32 && 
cell.i32[0] > 0146
)
899
146
            {
900
146
              if (ccv_array_find_uint(data->command_set, node->cmd.cmd))
901
72
                stream_idx = j, stream_has_command = 1;
902
74
              else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet.
903
29
                stream_idx = j;
904
146
            }
905
8.93k
          }
906
33.9k
        }
907
1.30k
        if (stream_idx < 0)
908
1.20k
        {
909
1.20k
          stream_idx = stream_data->rnum;
910
1.20k
          const ccv_nnc_stream_data_t data = {
911
1.20k
            .device_id = device_ids[i],
912
1.20k
          };
913
1.20k
          ccv_array_push(stream_data, &data);
914
1.20k
        }
915
1.30k
        assert(stream_idx >= 0);
916
1.30k
        ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
917
1.30k
        if (!data->command_set)
918
1.20k
          data->command_set = ccv_array_new(sizeof(uint32_t), 1, 0);
919
1.30k
        SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idx;
920
1.30k
        ccv_array_add_unique_uint(data->command_set, node->cmd.cmd);
921
1.30k
        // Assign all subsequent node to use this stream.
922
1.30k
        int outgoing_idx = idx;
923
5.61k
        while (outgoings[outgoing_idx] && 
outgoings[outgoing_idx]->rnum5.29k
)
924
5.23k
        {
925
5.23k
          int highest_rank = -1;
926
5.23k
          int highest_idx = -1;
927
5.23k
          int stream_n = -1;
928
5.23k
          int stream_has_command = 0;
929
12.8k
          for (j = 0; j < outgoings[outgoing_idx]->rnum; 
j++7.62k
)
930
7.62k
          {
931
7.62k
            const int d = *(int*)ccv_array_get(outgoings[outgoing_idx], j);
932
7.62k
            // This is not outside of our scope at this point.
933
7.62k
            assert(schd_info[d].stream_size >= 0);
934
7.62k
            ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + d;
935
7.62k
            const int outgoing_device_id_size = _ccv_nnc_device_ids_for_stream_data(outgoing_node, device_id, stream_data, outgoing_device_ids, max_device_id_size);
936
7.62k
            if (schd_info[d].stream_size == 0)
937
4.82k
            {
938
4.82k
              schd_info[d].stream_size = outgoing_device_id_size; // At least at the same size as the device_id_size.
939
4.82k
              if (outgoing_device_id_size > 1)
940
144
              {
941
144
                schd_info[d]._heap_streams = (int*)ccmalloc(sizeof(int) * outgoing_device_id_size * 2);
942
144
                schd_info[d]._heap_signals = (schd_info[d]._heap_streams + outgoing_device_id_size);
943
144
              }
944
10.0k
              for (k = 0; k < outgoing_device_id_size; 
k++5.22k
)
945
5.22k
                SCHEDULE_STREAMS(schd_info[d])[k] = -1, SCHEDULE_SIGNALS(schd_info[d])[k] = -1;
946
4.82k
            }
947
7.62k
            assert(schd_info[d].stream_size == outgoing_device_id_size);
948
16.2k
            
for (k = 0; 7.62k
k < outgoing_device_id_size;
k++8.66k
)
949
8.66k
              // If it should be on the same device and the stream is not assign, potentially.
950
8.66k
              if (outgoing_device_ids[k] == device_ids[i] &&
951
8.66k
                
SCHEDULE_STREAMS5.37k
(schd_info[d])[k] < 05.37k
&&
952
8.66k
                
(4.92k
incomings[d].rank > highest_rank4.92k
||
953
4.92k
                 
(621
incomings[d].rank == highest_rank621
&&
954
621
                  !stream_has_command && 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
)))
955
4.30k
              {
956
4.30k
                highest_rank = incomings[d].rank;
957
4.30k
                highest_idx = d;
958
4.30k
                stream_n = k;
959
4.30k
                // This is 1 if rank is the same (thus, I must break the tie already), if the rank is not the same, we need to compute this.
960
4.30k
                stream_has_command = (incomings[d].rank == highest_rank || 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
);
961
4.30k
              }
962
7.62k
          }
963
5.23k
          if (highest_idx >= 0)
964
4.30k
          {
965
4.30k
            outgoing_idx = highest_idx;
966
4.30k
            ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx;
967
4.30k
            assert(stream_n >= 0);
968
4.30k
            SCHEDULE_STREAMS(schd_info[outgoing_idx])[stream_n] = stream_idx;
969
4.30k
            ccv_array_add_unique_uint(data->command_set, outgoing_node->cmd.cmd);
970
4.30k
          } else
971
924
            break;
972
5.23k
        }
973
1.30k
        data->exec_idx = outgoing_idx;
974
1.30k
      }
975
5.19k
  } ccv_nnc_graph_visit_endfor
976
355
  // Go through to assign signals when necessary.
977
5.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
978
5.19k
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum4.82k
)
979
4.82k
      _ccv_nnc_graph_schedule_assign_signals(incomings[idx].outgoings, schd_info + idx, stream_data, &signal_size, schd_info, exec_info_size);
980
5.19k
  } ccv_nnc_graph_visit_endfor
981
7.52k
  for (i = 0; i < exec_info_size; 
i++7.17k
)
982
7.17k
    if (outgoings[i])
983
4.87k
      ccv_array_free(outgoings[i]);
984
355
  ccfree(outgoings);
985
355
  ccv_matrix_free(exec_dep);
986
355
  ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
987
355
  if (device_id >= 0)
988
4
  {
989
4
    // If the default stream (stream 0) is not the same as desired stream, swap with the one that is.
990
4
    if (default_data->device_id != device_id)
991
0
    {
992
0
      int exchange_stream_idx = -1;
993
0
      // Find the stream idx to exchange.
994
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
995
0
        int flag = 0;
996
0
        for(i = 0; !flag && i < schd_info[idx].stream_size; i++)
997
0
        {
998
0
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[i];
999
0
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
1000
0
          if (data->device_id == device_id)
1001
0
          {
1002
0
            exchange_stream_idx = stream_idx;
1003
0
            flag = 1;
1004
0
          }
1005
0
        }
1006
0
        if (flag)
1007
0
          break;
1008
0
      } ccv_nnc_graph_visit_endfor
1009
0
      assert(exchange_stream_idx >= 0);
1010
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1011
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1012
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == 0)
1013
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = -1;
1014
0
      } ccv_nnc_graph_visit_endfor
1015
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1016
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1017
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == exchange_stream_idx)
1018
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = 0;
1019
0
      } ccv_nnc_graph_visit_endfor
1020
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1021
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1022
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == -1)
1023
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = exchange_stream_idx;
1024
0
      } ccv_nnc_graph_visit_endfor
1025
0
      ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, exchange_stream_idx))->device_id = default_data->device_id;
1026
0
      default_data->device_id = device_id;
1027
0
    }
1028
4
  }
1029
355
  int graph_stream_1_size = 0;
1030
730
  for (i = 0; i < source_size; 
i++375
)
1031
375
  {
1032
375
    const int idx = sources[i].d;
1033
375
    // If it has incoming nodes, check whether these are on stream 0.
1034
375
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum0
)
1035
0
    {
1036
0
      int flag  = 0;
1037
0
      const ccv_array_t* const incoming = incomings[idx].outgoings;
1038
0
      for (j = 0; !flag && j < incoming->rnum; j++)
1039
0
      {
1040
0
        const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1041
0
        for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1042
0
          flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1043
0
      }
1044
0
      if (flag)
1045
0
        continue;
1046
375
    }
1047
764
    
for (j = 0; 375
j < schd_info[idx].stream_size;
j++389
)
1048
389
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need explicit begin signal to start.
1049
34
        ++graph_stream_1_size;
1050
375
  }
1051
355
  if (graph_stream_1_size > 0)
1052
14
  {
1053
14
    schedule->stream_1s = ccmalloc(sizeof(int) * graph_stream_1_size);
1054
14
    graph_stream_1_size = 0;
1055
48
    for (i = 0; i < source_size; 
i++34
)
1056
34
    {
1057
34
      const int idx = sources[i].d;
1058
34
      // If it has incoming nodes, check whether these are on stream 0.
1059
34
      if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum0
)
1060
0
      {
1061
0
        int flag  = 0;
1062
0
        const ccv_array_t* const incoming = incomings[idx].outgoings;
1063
0
        for (j = 0; !flag && j < incoming->rnum; j++)
1064
0
        {
1065
0
          const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1066
0
          for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1067
0
            flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1068
0
        }
1069
0
        if (flag)
1070
0
          continue;
1071
34
      }
1072
82
      
for (j = 0; 34
j < schd_info[idx].stream_size;
j++48
)
1073
48
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need explicit begin signal to start.
1074
34
        {
1075
34
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[j];
1076
34
          int flag = 0;
1077
64
          for (k = 0; !flag && k < graph_stream_1_size; 
k++30
)
1078
30
            flag = (stream_idx == schedule->stream_1s[k]);
1079
34
          if (!flag)
1080
34
            schedule->stream_1s[graph_stream_1_size++] = stream_idx;
1081
34
        }
1082
34
    }
1083
14
    schedule->stream_1_size = graph_stream_1_size;
1084
14
  }
1085
7.52k
  for (i = 0; i < exec_info_size; 
i++7.17k
)
1086
7.17k
    if (incomings[i].outgoings)
1087
4.82k
      ccv_array_free(incomings[i].outgoings);
1088
355
  ccfree(incomings);
1089
355
  int graph_wait_size = 0;
1090
736
  for (i = 0; i < destination_size; 
i++381
)
1091
381
  {
1092
381
    const int idx = destinations[i].d;
1093
762
    for (j = 0; j < schd_info[idx].stream_size; 
j++381
)
1094
381
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
1095
26
        ++graph_wait_size;
1096
381
  }
1097
355
  if (graph_wait_size > 0)
1098
10
  {
1099
10
    schedule->waits = ccmalloc(sizeof(int) * graph_wait_size);
1100
10
    graph_wait_size = 0;
1101
46
    for (i = 0; i < destination_size; 
i++36
)
1102
36
    {
1103
36
      const int idx = destinations[i].d;
1104
72
      for (j = 0; j < schd_info[idx].stream_size; 
j++36
)
1105
36
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
1106
26
        {
1107
26
          ccv_nnc_stream_data_t* const default_stream_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
1108
26
          if (SCHEDULE_SIGNALS(schd_info[idx])[j] < 0)
1109
26
            SCHEDULE_SIGNALS(schd_info[idx])[j] = signal_size++;
1110
0
          else if (default_stream_data->signal_set && ccv_array_find_int(default_stream_data->signal_set, SCHEDULE_SIGNALS(schd_info[idx])[j]))
1111
0
            continue;
1112
26
          schedule->waits[graph_wait_size++] = SCHEDULE_SIGNALS(schd_info[idx])[j];
1113
26
        }
1114
36
    }
1115
10
    schedule->wait_size = graph_wait_size;
1116
10
  }
1117
1.55k
  for (i = 0; i < stream_data->rnum; 
i++1.20k
)
1118
1.20k
  {
1119
1.20k
    ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1120
1.20k
    if (data->signal_set)
1121
896
      ccv_array_free(data->signal_set);
1122
1.20k
    assert(data->command_set);
1123
1.20k
    ccv_array_free(data->command_set);
1124
1.20k
  }
1125
355
  // Allocate streams & signals
1126
355
  int default_stream_type = stream_type;
1127
355
  CCV_STREAM_SET_DEVICE_ID(default_stream_type, default_data->device_id);
1128
355
  if (root_schedule)
1129
301
  {
1130
301
    assert(!graph->streams);
1131
301
    graph->stream_size = stream_data->rnum;
1132
301
    graph->streams = (ccv_nnc_stream_context_t**)ccmalloc(sizeof(ccv_nnc_stream_context_t*) * graph->stream_size);
1133
301
    graph->block_stream_tasks = (co_routine_t**)cccalloc(graph->stream_size, sizeof(co_routine_t*));
1134
301
    if (stream_context)
1135
4
      graph->streams[0] = stream_context;
1136
1.36k
    for (i = (stream_context ? 
14
:
0297
); i < stream_data->rnum;
i++1.06k
)
1137
1.06k
    {
1138
1.06k
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1139
1.06k
      int type = stream_type;
1140
1.06k
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1141
1.06k
      graph->streams[i] = ccv_nnc_stream_context_new(type);
1142
1.06k
    }
1143
301
    graph->signal_size = signal_size;
1144
301
    graph->signals = (ccv_nnc_stream_signal_t**)cccalloc(signal_size, sizeof(ccv_nnc_stream_signal_t*));
1145
3.68k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1146
7.78k
      for (i = 0; i < schd_info[idx].stream_size; 
i++4.10k
)
1147
4.10k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1148
1.14k
        {
1149
1.14k
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1150
1.14k
          if (!graph->signals[signal])
1151
1.14k
          {
1152
1.14k
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1153
1.14k
            int type = stream_type;
1154
1.14k
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1155
1.14k
            graph->signals[signal] = ccv_nnc_stream_signal_new(type);
1156
1.14k
          }
1157
1.14k
        }
1158
3.68k
    } ccv_nnc_graph_visit_endfor
1159
301
  } else {
1160
54
    assert(graph->streams);
1161
54
    assert(graph->stream_size >= stream_data->rnum);
1162
54
    // Find streams to proper allocated stream based on the type we need.
1163
54
    int* const stream_idxs = (int*)ccmalloc(sizeof(int) * (stream_data->rnum + signal_size));
1164
54
    uint64_t* const stream_used = (uint64_t*)cccalloc(((graph->stream_size + 63) >> 6) + ((graph->signal_size + 63) >> 6), sizeof(uint64_t));
1165
188
    for (i = 0; i < stream_data->rnum; 
i++134
)
1166
134
    {
1167
134
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1168
134
      int type = stream_type;
1169
134
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1170
489
      for (j = 0; j < graph->stream_size; 
j++355
)
1171
489
        if (!(stream_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1172
171
        {
1173
171
          const int stream_type = ccv_nnc_stream_context_type(graph->streams[j]);
1174
171
          if (stream_type == type)
1175
134
          {
1176
134
            stream_idxs[i] = j;
1177
134
            stream_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1178
134
            break;
1179
134
          }
1180
171
        }
1181
134
    }
1182
54
    assert(graph->signal_size >= signal_size);
1183
54
    // Find signals to proper allocated signal based on the type we need.
1184
54
    int* const signal_idxs = stream_idxs + stream_data->rnum;
1185
54
    uint64_t* const signal_used = stream_used + ((graph->stream_size + 63) >> 6);
1186
228
    for (i = 0; i < signal_size; 
i++174
)
1187
174
      signal_idxs[i] = -1;
1188
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1189
3.02k
      for (i = 0; i < schd_info[idx].stream_size; 
i++1.51k
)
1190
1.51k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1191
174
        {
1192
174
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1193
174
          if (signal_idxs[signal] < 0)
1194
174
          {
1195
174
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1196
174
            int type = stream_type;
1197
174
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1198
2.24k
            for (j = 0; j < graph->signal_size; 
j++2.07k
)
1199
2.24k
              if (!(signal_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1200
308
              {
1201
308
                const int signal_type = ccv_nnc_stream_signal_type(graph->signals[j]);
1202
308
                if (signal_type == type)
1203
174
                {
1204
174
                  signal_idxs[signal] = j;
1205
174
                  signal_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1206
174
                  break;
1207
174
                }
1208
308
              }
1209
174
          }
1210
174
        }
1211
1.51k
    } ccv_nnc_graph_visit_endfor
1212
54
    // Now rebind streams and signals from the schedule.
1213
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1214
3.02k
      for (i = 0; i < schd_info[idx].stream_size; 
i++1.51k
)
1215
1.51k
      {
1216
1.51k
        SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idxs[SCHEDULE_STREAMS(schd_info[idx])[i]];
1217
1.51k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1218
1.51k
          
SCHEDULE_SIGNALS174
(schd_info[idx])[i] = signal_idxs[174
SCHEDULE_SIGNALS174
(schd_info[idx])[i]];
1219
1.51k
      }
1220
1.69k
      for (i = 0; i < schd_info[idx].wait_size; 
i++180
)
1221
180
        schd_info[idx].waits[i] = signal_idxs[schd_info[idx].waits[i]];
1222
1.51k
    } ccv_nnc_graph_visit_endfor
1223
80
    for (i = 0; i < schedule->wait_size; 
i++26
)
1224
26
      schedule->waits[i] = signal_idxs[schedule->waits[i]];
1225
54
    // Rebind who is the stream 0 (default stream).
1226
54
    schedule->stream_0 = stream_idxs[0];
1227
54
    ccfree(stream_used);
1228
54
    ccfree(stream_idxs);
1229
54
  }
1230
355
  assert(graph->streams);
1231
355
  ccv_nnc_graph_visit_free(visit);
1232
1.66k
  for (i = 0; i < signal_size; 
i++1.31k
)
1233
1.31k
    { assert(graph->signals[i]); }
1234
355
  if (schedule->stream_1_size)
1235
14
    schedule->begin = ccv_nnc_stream_signal_new(default_stream_type);
1236
355
  schedule->end = ccv_nnc_stream_signal_new(default_stream_type);
1237
355
  // Do this recursively for its sub graphs.
1238
355
  if (graph->sub_graphs)
1239
7
    
for (i = 0; 3
i < graph->sub_graphs->rnum;
i++4
)
1240
4
    {
1241
4
      ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i);
1242
4
      if (sub_graph && !sub_graph->default_schedule)
1243
4
      {
1244
4
        const int exec_idx = sub_graph->exec_idx - 1;
1245
4
        assert(schd_info[exec_idx].stream_size == 1);
1246
4
        const int stream_idx = SCHEDULE_STREAMS(schd_info[exec_idx])[0];
1247
4
        const int device_id = ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx))->device_id;
1248
4
        sub_graph->default_schedule = _ccv_nnc_graph_static_schedule_new(sub_graph, stream_type, device_id, graph->streams[stream_idx], 0, 0, 0, 0);
1249
4
      }
1250
4
    }
1251
355
  ccv_array_free(stream_data);
1252
355
  return schedule;
1253
355
}
1254
void ccv_nnc_graph_set_default_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type)
{
	// Install (or replace) the default static schedule of a top-level graph.
	// Only valid on a graph without a parent.
	assert(graph->p == 0);
	ccv_nnc_graph_static_schedule_t* const old_schedule = graph->default_schedule;
	if (old_schedule)
		ccv_nnc_graph_static_schedule_free(old_schedule);
	// Schedule the whole graph: no explicit device (-1), no parent stream, and no
	// explicit sources / destinations.
	graph->default_schedule = _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, 0, 0, 0, 0, 0);
}
1261
1262
ccv_nnc_graph_static_schedule_t* ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
	// Build a static schedule restricted to the given sources / destinations.
	// Scheduling is only supported on a top-level graph (no parent).
	assert(graph->p == 0);
	// Device id -1 and a null parent stream let the internal scheduler pick defaults.
	ccv_nnc_graph_static_schedule_t* const schedule = _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, 0, sources, source_size, destinations, destination_size);
	return schedule;
}
1267
1268
ccv_nnc_stream_context_t* ccv_nnc_graph_default_stream(const ccv_nnc_graph_t* const graph)
1269
9
{
1270
9
  if (graph->streams && graph->stream_size > 0)
1271
9
    return graph->streams[0];
1272
0
  return 0;
1273
0
}
1274
1275
// Emit the Graphviz record-label cell for one exec node into `out`.
// Short form (flags != CCV_NNC_LONG_DOT_GRAPH): just "node<index>".
// Long form: "{node<index>|Command: <name>...}" and, when schd_info is given,
// the stream assignments, emitted signals and awaited signals of the node.
static void _ccv_nnc_graph_dot_exec(const int index, const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_stream_context_t** const streams, const int flags, FILE* out)
{
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
		fputc('{', out);
	fprintf(out, "node%d", index);
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
	{
		fputs("|Command: ", out);
		fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
		if (schd_info)
		{
			if (schd_info->stream_size > 0)
			{
				int i, flag = 0;
				fputs("|Stream: ", out);
				// List every stream this node runs on, with its device id when the
				// stream contexts are available (0 otherwise).
				for (i = 0; i < schd_info->stream_size; i++)
				{
					const int device_id = streams ? CCV_TENSOR_GET_DEVICE_ID(streams[SCHEDULE_STREAMS(*schd_info)[i]]->type) : 0;
					if (i == 0)
						fprintf(out, "%d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
					else
						fprintf(out, ", %d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
				}
				// List the signals this node emits (negative means no signal on that stream).
				for (i = 0; i < schd_info->stream_size; i++)
					if (SCHEDULE_SIGNALS(*schd_info)[i] >= 0)
					{
						if (!flag)
						{
							// First signal opens the "|Signal:" section.
							flag = 1;
							fprintf(out, "|Signal: %d", SCHEDULE_SIGNALS(*schd_info)[i]);
						} else
							fprintf(out, ", %d", SCHEDULE_SIGNALS(*schd_info)[i]);
					}
			}
			if (schd_info->wait_size > 0)
			{
				// List the signals this node waits on before executing.
				fputs("|Wait: ", out);
				int i;
				for (i = 0; i < schd_info->wait_size - 1; i++)
					fprintf(out, "%d, ", schd_info->waits[i]);
				fprintf(out, "%d", schd_info->waits[schd_info->wait_size - 1]);
			}
		}
		fputc('}', out);
	}
}
1321
1322
// Print one (non-multiview) tensor cell of a dot record label.
// Short form: "tensor<index>" (or "tensorview<index>" for views). Long form adds
// the device id, memory-zone id, raw address range and the dimensions. `depth`
// appends that many apostrophes so tensors referenced from nested sub-graphs
// are visually distinguishable from their parent-level counterparts.
static void _ccv_nnc_graph_dot_tensor(const int index, const ccv_nnc_tensor_t* const tensor, const int zone, const int flags, const int depth, FILE* out)
{
	// if it has an alias pointer, or, it is a long form.
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
		fputc('{', out);
	const int is_tensor_view = CCV_IS_TENSOR_VIEW(tensor);
	if (is_tensor_view)
		fprintf(out, "tensorview%d", index);
	else
		fprintf(out, "tensor%d", index);
	int i;
	for (i = 0; i < depth; i++) // Print subscription to denote depth.
		fputc('\'', out);
	if (CCV_GET_TAPE_ALLOC(tensor->type))
		fputs(" (t)", out); // Tape-allocated tensor marker.
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
	{
		const int device_id = CCV_TENSOR_GET_DEVICE_ID(tensor->info.type);
		fprintf(out, "|d%d|zone%d", device_id, zone);
		for (i = 0; i < depth; i++) // Print subscription to denote depth.
			fputc('\'', out);
		uintptr_t aptr = (uintptr_t)tensor->data.u8;
		// A view strides by `inc`; a plain tensor's stride is its own dimensions.
		const int* ainc = is_tensor_view ? ((ccv_nnc_tensor_view_t*)(tensor))->inc : tensor->info.dim;
		// For the last one, we don't extend to full ainc.
		size_t ainc_size = (ccv_nnc_dimension_count(ainc) - ainc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
		// Print out the range as well.
		fprintf(out, "|{%#010x|%#010x}|%d", (uint32_t)aptr, (uint32_t)(aptr + ainc_size - 1), tensor->info.dim[0]);
		for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
			fprintf(out, "x%d", tensor->info.dim[i]);
		fputc('}', out);
	}
}
1354
1355
// Bookkeeping record for one tensor occurrence when rendering a graph to dot.
typedef struct {
	int index; // Per-distinct-tensor id: records with the same tensor_ref share an index (assigned after sorting by address).
	int name; // Position of this record in graph traversal (insertion) order; used to map back via `remap`.
	int zone; // Memory-zone id: records whose address ranges overlap are grouped into one zone.
	uintptr_t tensor_ref; // Identity used to cluster duplicate references (the tensor pointer, or the data pointer for multiview leaves).
	uintptr_t start_ptr; // First byte of the tensor's data region.
	uintptr_t end_ptr; // Last byte (inclusive) of the tensor's data region.
} ccv_nnc_tensor_dot_t;
1363
1364
// Result of _ccv_nnc_graph_tensor_dot_recovery: dot records plus lookup tables
// that translate raw (address-ordered) ids into human-friendly traversal-ordered
// names. `remap`, `rename_zone` and `rename_index` live in one allocation owned
// by `remap`.
typedef struct {
	ccv_nnc_tensor_dot_t* dots; // Records sorted by (start_ptr, tensor_ref).
	int* remap; // remap[name] = position of that record inside `dots` (insertion order -> sorted order).
	int* rename_zone; // rename_zone[zone] = zone id renumbered in traversal order.
	int* rename_index; // rename_index[index] = tensor id renumbered in traversal order.
} ccv_nnc_tensor_dot_recovery_t;
1370
1371
// First sort by start_ptr, then sort by tensor ptr (so that we will have the same tensor sorted to one cluster).
// `less_than` is consumed by CCV_IMPLEMENT_QSORT, which expands into the static
// sorter _ccv_nnc_tensor_dot_sort_by_ptr(array, count, aux) used during recovery.
#define less_than(i1, i2, aux) ((i1).start_ptr < (i2).start_ptr || ((i1).start_ptr == (i2).start_ptr && (i1).tensor_ref < (i2).tensor_ref))
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_dot_sort_by_ptr, ccv_nnc_tensor_dot_t, less_than)
#undef less_than
1375
1376
static int _ccv_nnc_graph_dot_tensor_multiview_count(const ccv_nnc_tensor_multiview_t* const mv)
1377
260
{
1378
260
  if (!CCV_IS_TENSOR_MULTIVIEW(mv))
1379
260
    
return 1174
;
1380
86
  const int count = mv->kind + mv->repeat;
1381
86
  int i, c = 0;
1382
269
  for (i = 0; i < count; 
i++183
)
1383
183
    c += _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1384
86
  return c;
1385
86
}
1386
1387
// Append one dot record per leaf tensor inside a multiview (recursing into
// nested multiviews), writing at *tensor_index and advancing it per leaf.
static void _ccv_nnc_graph_dot_tensor_multiview_tensor_dots(const ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_dot_t* const tensor_dots, int* tensor_index)
{
	const int count = mv->kind + mv->repeat;
	int i;
	for (i = 0; i < count; i++)
		if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
			_ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor_dots, tensor_index);
		else {
			tensor_dots[*tensor_index].name = *tensor_index;
			tensor_dots[*tensor_index].start_ptr =  (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
			// Because tv's pointer will get updated, it is not correct in this case to have one tensor_ref.
			tensor_dots[*tensor_index].tensor_ref = tensor_dots[*tensor_index].start_ptr;
			const size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
			tensor_dots[*tensor_index].end_ptr = tensor_dots[*tensor_index].start_ptr + dim_size - 1;
			++(*tensor_index);
		}
}
1404
1405
// Build the ccv_nnc_tensor_dot_recovery_t for a graph: collect one record per
// tensor occurrence (expanding multiviews to their leaves), sort them by
// address, assign per-tensor indexes and overlapping-memory zones, then rename
// both into traversal order for readable dot output. Caller frees the result
// with _ccv_nnc_graph_tensor_dot_recovery_free.
static ccv_nnc_tensor_dot_recovery_t _ccv_nnc_graph_tensor_dot_recovery(const ccv_nnc_graph_t* const graph)
{
	int i, j;
	// Recover tensor relationships for all tensors referenced in the graph.
	// Most notably, we have to give these indexes, and find if they point to
	// the same memory region, and whether they overlap. These information
	// are lost since we converted from symbolic form to the execution form.
	// and here we do our best to recover because that is easier to understand
	// if we want to present the graph visually (also, we don't want to put this
	// information into the tensor or execution graph to avoid overhead, thus,
	// recovering is the best we can do).
	int tensor_count = 0;
	// First pass: upper-bound the number of records (multiviews expand to leaves).
	for (i = 0; i < graph->exec_info->rnum; i++)
	{
		ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
		for (j = 0; j < exec_info->input_size; j++)
			if (exec_info->inputs[j])
				tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->inputs[j]) : 1;
		for (j = 0; j < exec_info->output_size; j++)
			if (exec_info->outputs[j])
				tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->outputs[j]) : 1;
	}
	ccv_nnc_tensor_dot_t* tensor_dots = tensor_count > 0 ? (ccv_nnc_tensor_dot_t*)ccmalloc(sizeof(ccv_nnc_tensor_dot_t) * tensor_count) : 0;
	int k = 0;
	// Second pass: fill a record per occurrence, in graph traversal order
	// (inputs then outputs of each exec). `name` remembers that order.
	for (i = 0; i < graph->exec_info->rnum; i++)
	{
		ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
		for (j = 0; j < exec_info->input_size; j++)
		{
			ccv_nnc_tensor_t* tensor = exec_info->inputs[j];
			if (!tensor)
				continue;
			if (CCV_IS_TENSOR_MULTIVIEW(tensor))
				_ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
			else {
				tensor_dots[k].name = k;
				tensor_dots[k].tensor_ref = (uintptr_t)tensor;
				tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
				// Views stride by `inc`; the occupied byte range ends one row short of full inc.
				const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
				const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
				tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
				++k;
			}
		}
		for (j = 0; j < exec_info->output_size; j++)
		{
			ccv_nnc_tensor_t* tensor = exec_info->outputs[j];
			if (!tensor)
				continue;
			if (CCV_IS_TENSOR_MULTIVIEW(tensor))
				_ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
			else {
				tensor_dots[k].name = k;
				tensor_dots[k].tensor_ref = (uintptr_t)tensor;
				tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
				const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
				const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
				tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
				++k;
			}
		}
	}
	tensor_count = k; // We may over count, now shrink.
	// To group overlap memory into one zone, we sort it by start ptr first (secondary by the tensor pointer).
	_ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0);
	int index = 0, zone = 0;
	uintptr_t tensor_ref = tensor_count > 0 ? tensor_dots[0].tensor_ref : 0;
	uintptr_t end_ptr = tensor_count > 0 ? tensor_dots[0].end_ptr : 0;
	// Then, it is trivial, we go by end ptr. If the next start ptr is still within the end ptr (start ptr <= end ptr),
	// they are the same zone.
	for (i = 0; i < tensor_count; i++)
	{
		// New distinct tensor => new index.
		if (tensor_dots[i].tensor_ref != tensor_ref)
		{
			tensor_ref = tensor_dots[i].tensor_ref;
			++index;
		}
		// NOTE(review): end_ptr only advances when a new zone is opened, so a chain of
		// partially-overlapping ranges could be split into separate zones — confirm
		// this is intended (visualization-only impact).
		if (tensor_dots[i].start_ptr > end_ptr)
		{
			end_ptr = ccv_max(end_ptr, tensor_dots[i].end_ptr);
			++zone;
		}
		tensor_dots[i].index = index;
		tensor_dots[i].zone = zone;
	}
	// We already have index and zone assigned, but the problem is that these are not very human interpretable (because
	// it follows the pointer from low to high, not the tensor creation order). The following code renamed both the index
	// and the zone so that it is much more understandable.
	const int index_count = index + 1;
	const int zone_count = zone + 1;
	// Single allocation carved into remap / rename_index / rename_zone; `remap` owns it.
	int* remap = (int*)ccmalloc(sizeof(int) * (tensor_count + index_count + zone_count));
	int* rename_index = remap + tensor_count;
	int* rename_zone = rename_index + index_count;
	for (i = 0; i < tensor_count; i++)
		remap[tensor_dots[i].name] = i;
	for (i = 0; i < index_count; i++)
		rename_index[i] = -1;
	for (i = 0; i < zone_count; i++)
		rename_zone[i] = -1;
	index = 0;
	zone = 0;
	// Walk in traversal (name) order; first time an index/zone is seen, give it
	// the next sequential human-friendly number.
	for (i = 0; i < tensor_count; i++)
	{
		ccv_nnc_tensor_dot_t* tensor_dot = tensor_dots + remap[i];
		if (rename_index[tensor_dot->index] == -1)
			rename_index[tensor_dot->index] = index++;
		if (rename_zone[tensor_dot->zone] == -1)
			rename_zone[tensor_dot->zone] = zone++;
	}
	ccv_nnc_tensor_dot_recovery_t recovery = {
		.dots = tensor_dots,
		.remap = remap,
		.rename_index = rename_index,
		.rename_zone = rename_zone,
	};
	return recovery;
}
1522
1523
static void _ccv_nnc_graph_tensor_dot_recovery_free(const ccv_nnc_tensor_dot_recovery_t recovery)
1524
169
{
1525
169
  ccfree(recovery.dots);
1526
169
  ccfree(recovery.remap);
1527
169
}
1528
1529
// Render the nested cell structure of a multiview tensor: one "{i ...}" cell per
// view, recursing into multiview children. A '*' marks views the executor loops
// over (kind K0N, or K1N when i > 0). *tensor_index is advanced past every leaf
// view printed, keeping the caller in sync with the recovery records.
static void _ccv_nnc_graph_dot_tensor_multiview_one(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int depth, int* tensor_index, FILE* out)
{
	const int count = mv->kind + mv->repeat;
	int i, j;
	fputs("|{", out);
	for (i = 0; i < count; i++)
		if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
		{
			// Nested multiview: recurse to print its own cell structure.
			fprintf(out, "{%d", i);
			if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
				fputc('*', out); // Denotes that we loop on this.
			_ccv_nnc_graph_dot_tensor_multiview_one((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], recovery, depth, tensor_index, out);
			if (i == count - 1)
				fputc('}', out);
			else
				fputs("}|", out);
		} else {
			// Leaf tensor: print its zone, depth markers and address range.
			fprintf(out, "{%d", i);
			if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
				fputc('*', out); // Denotes that we loop on this.
			const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
			fprintf(out, "|zone%d", recovery.rename_zone[tensor_dot->zone]);
			for (j = 0; j < depth; j++)
				fputc('\'', out);
			uintptr_t aptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
			// For the last one, we don't extend to full ainc.
			size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
			// Print out the range as well.
			fprintf(out, "|{%#010x|%#010x}", (uint32_t)aptr, (uint32_t)(aptr + dim_size - 1));
			++(*tensor_index);
			if (i == count - 1)
				fputc('}', out);
			else
				fputs("}|", out);
		}
	fputc('}', out);
}
1566
1567
// Print a multiview tensor cell. The long form renders the full nested view
// structure plus the dimensions of the leftmost leaf tensor; the short form only
// prints "multiview<index>", but still advances *tensor_index past every leaf
// view so the caller's walk of the recovery records stays in sync.
static void _ccv_nnc_graph_dot_tensor_multiview(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, int* tensor_index, FILE* out)
{
	// if it has an alias pointer, or, it is a long form.
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
		fputc('{', out);
	const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
	fprintf(out, "multiview%d", recovery.rename_index[tensor_dot->index]);
	int i;
	for (i = 0; i < depth; i++) // Print subscription to denote depth.
		fputc('\'', out);
	if (CCV_GET_TAPE_ALLOC(mv->type))
		fputs(" (t)", out); // Tape-allocated marker.
	if (flags == CCV_NNC_LONG_DOT_GRAPH)
	{
		// Advances *tensor_index past all leaves as a side effect.
		_ccv_nnc_graph_dot_tensor_multiview_one(mv, recovery, depth, tensor_index, out);
		// Descend through view 0 until a plain tensor is found to print dimensions.
		const ccv_nnc_tensor_t* root = (ccv_nnc_tensor_t*)mv;
		while (CCV_IS_TENSOR_MULTIVIEW(root))
			root = CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)root)[0];
		fprintf(out, "|%d", root->info.dim[0]);
		for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && root->info.dim[i]; i++)
			fprintf(out, "x%d", root->info.dim[i]);
		fputc('}', out);
	} else
		// Short form prints nothing further, but must still consume the leaf records.
		*tensor_index += _ccv_nnc_graph_dot_tensor_multiview_count(mv);
}
1592
1593
// Emit a complete `node<exec_index> [shape=record,label="..."]` dot statement for
// one exec node: the command cell (via _ccv_nnc_graph_dot_exec) followed by its
// Input and Output tensor cells. *tensor_index walks the recovery records in the
// same traversal order used to build them and is advanced past every tensor
// printed; a missing (null) tensor slot prints as "-".
static void _ccv_nnc_graph_dot_node(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int exec_index, ccv_nnc_stream_context_t** const streams, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* const tensor_index)
{
	fprintf(out, "node%d [shape=record,label=\"", exec_index);
	_ccv_nnc_graph_dot_exec(exec_index, exec_info, schd_info, streams, flags, out);
	int i;
	int k = *tensor_index;
	if (exec_info->input_size > 0)
	{
		fputs("|{Input", out);
		for (i = 0; i < exec_info->input_size; i++)
			if (exec_info->inputs[i])
			{
				fputc('|', out);
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputs("|-", out); // Placeholder for a null input slot.
		fputc('}', out);
	}
	if (exec_info->output_size > 0)
	{
		fputs("|{Output", out);
		for (i = 0; i < exec_info->output_size; i++)
			if (exec_info->outputs[i])
			{
				fputc('|', out);
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputs("|-", out); // Placeholder for a null output slot.
		fputc('}', out);
	}
	fputs("\"];\n", out);
	*tensor_index = k;
}
1638
1639
// Emit the label for a while sub-graph cluster: a bold "while<exec_index>" HTML
// caption with the command name, plus a `label<exec_index>` record node listing
// the loop's input / output tensors. *tensor_index is advanced past each tensor
// printed.
// NOTE(review): `while_graph` is not referenced in this body — presumably kept
// for signature symmetry; confirm before removing.
static void _ccv_nnc_graph_dot_while_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const ccv_nnc_graph_t* const while_graph, const int flags, const int depth, FILE* out, int* tensor_index)
{
	int i;
	fprintf(out, "label=<<b>while%d </b>Command: ", exec_index);
	fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
	fputs(">;\n", out);
	fprintf(out, "label%d [shape=record,label=\"{", exec_index);
	int k = *tensor_index;
	if (exec_info->input_size > 0)
	{
		fputs("{Input|{", out);
		for (i = 0; i < exec_info->input_size; i++)
		{
			if (i > 0)
				fputc('|', out);
			if (exec_info->inputs[i])
			{
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputc('-', out); // Placeholder for a null input slot.
		}
		fputs("}}", out);
	}
	if (exec_info->output_size > 0)
	{
		if (exec_info->input_size > 0)
			fputs("|", out);
		fputs("{Output|{", out);
		for (i = 0; i < exec_info->output_size; i++)
		{
			if (i > 0)
				fputc('|', out);
			if (exec_info->outputs[i])
			{
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputc('-', out); // Placeholder for a null output slot.
		}
		fputs("}}", out);
	}
	fputs("}\"];\n", out);
	*tensor_index = k;
}
1694
1695
// Emit the label for a case..of sub-graph cluster: a bold "caseof<exec_index>"
// HTML caption with the command name, plus a `label<exec_index>` record node
// listing its input / output tensors. Mirrors _ccv_nnc_graph_dot_while_label.
// *tensor_index is advanced past each tensor printed.
static void _ccv_nnc_graph_dot_case_of_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* tensor_index)
{
	int i;
	fprintf(out, "label=<<b>caseof%d </b>Command: ", exec_index);
	fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
	fputs(">;\n", out);
	fprintf(out, "label%d [shape=record,label=\"{", exec_index);
	int k = *tensor_index;
	if (exec_info->input_size > 0)
	{
		fputs("{Input|{", out);
		for (i = 0; i < exec_info->input_size; i++)
		{
			if (i > 0)
				fputc('|', out);
			if (exec_info->inputs[i])
			{
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputc('-', out); // Placeholder for a null input slot.
		}
		fputs("}}", out);
	}
	if (exec_info->output_size > 0)
	{
		if (exec_info->input_size > 0)
			fputs("|", out);
		fputs("{Output|{", out);
		for (i = 0; i < exec_info->output_size; i++)
		{
			if (i > 0)
				fputc('|', out);
			if (exec_info->outputs[i])
			{
				if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
					_ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
				else {
					const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
					_ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
					++k;
				}
			} else
				fputc('-', out); // Placeholder for a null output slot.
		}
		fputs("}}", out);
	}
	fputs("}\"];\n", out);
	*tensor_index = k;
}
1750
1751
// Render one exec node that hosts sub-graph(s) — a while loop or a case..of —
// as nested Graphviz "cluster" subgraphs, recursing into nested sub-graphs.
// p_recovery is the tensor-dot recovery of the PARENT graph (used only for the
// cluster's own label); each sub-graph builds its own recovery below.
// *tensor_index and *exec_index are running counters threaded through the
// whole dot emission so node ids stay unique across nesting levels; both are
// advanced in place.
static void _ccv_nnc_graph_dot_sub_graphs(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_tensor_dot_recovery_t p_recovery, const ccv_array_t* const sub_graphs, const int flags, const int depth, FILE* out, int* tensor_index, int* exec_index)
{
	if (exec_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
	{
		// Open the while cluster. The invisible node anchors edges that clip
		// to this cluster via ltail/lhead (requires compound=true upstream).
		fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
		const ccv_nnc_graph_t* const while_graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[0] - 1);
		// Output this node info within this subgraph.
		_ccv_nnc_graph_dot_while_label(exec_info, *exec_index, p_recovery, while_graph, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
	} else if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
		// Open the case..of cluster; per-branch clusters are opened per
		// iteration of the graph_ref loop below.
		fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
		_ccv_nnc_graph_dot_case_of_label(exec_info, *exec_index, p_recovery, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
	}
	// The cluster's anchor node consumed one id.
	++(*exec_index);
	int p;
	for (p = 0; p < exec_info->graph_ref_size; p++)
	{
		if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
		{
			// One unlabeled inner cluster per case..of branch.
			fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\nlabel=\"\"\n", *exec_index, *exec_index);
			++(*exec_index);
		}
		// CCV_NNC_GRAPH_REF entries are 1-based; 0 means "no sub-graph".
		const ccv_nnc_graph_t* const graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[p] - 1);
		const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
		ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
		int i, j;
		int k = 0;
		// Maps exec index within this sub-graph -> assigned dot node id,
		// needed to emit edges in the second pass.
		int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
		// Output styles.
		for (i = 0; i < graph->exec_info->rnum; i++)
		{
			node_id[i] = *exec_index;
			// NOTE: this inner exec_info deliberately shadows the function
			// parameter; it refers to the sub-graph's exec from here on.
			ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
			if (CCV_NNC_GRAPH_REF(exec_info)[0])
				// Nested while/case..of: recurse one level deeper.
				_ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, depth + 1, out, &k, exec_index);
			else {
				_ccv_nnc_graph_dot_node(exec_info,
					schedule ? (i < schedule->exec_info_size ? schedule->exec_info + i : 0) : 0,
					*exec_index, graph->streams, recovery, flags, depth, out, &k);
				++(*exec_index);
			}
		}
		// Output connections.
		for (i = 0; i < graph->exec_info->rnum; i++)
		{
			ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
			if (exec_info->outgoings)
				for (j = 0; j < exec_info->outgoings->rnum; j++)
				{
					const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
					const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
					// If both are sub-graphs, have both tail and head specified.
					if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
						fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
					else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
						// Only the source is a cluster: clip the tail.
						fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
					else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
						// Only the destination is a cluster: clip the head.
						fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
					else
						fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
				}
		}
		// Close this sub-graph's (branch) cluster.
		fputs("}\n", out);
		_ccv_nnc_graph_tensor_dot_recovery_free(recovery);
		ccfree(node_id);
	}
	// Extra subgraph cluster.
	if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
		fputs("}\n", out);
}
1820
1821
void ccv_nnc_graph_dot(const ccv_nnc_graph_t* const graph, const int flags, FILE* out)
1822
112
{
1823
112
  fputs("digraph G {\ncompound=true;\n", out);
1824
112
  ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
1825
112
  int i, j;
1826
112
  int k = 0, c = 0;
1827
112
  int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
1828
112
  const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
1829
112
  // Output styles.
1830
699
  for (i = 0; i < graph->exec_info->rnum; 
i++587
)
1831
587
  {
1832
587
    node_id[i] = c;
1833
587
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1834
587
    if (CCV_NNC_GRAPH_REF(exec_info)[0])
1835
34
      _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, 1, out, &k, &c);
1836
553
    else {
1837
553
      _ccv_nnc_graph_dot_node(exec_info,
1838
553
        schedule ? 
(i < schedule->exec_info_size 136
?
schedule->exec_info + i136
:
00
) :
0417
,
1839
553
        c, graph->streams, recovery, flags, 0, out, &k);
1840
553
      ++c;
1841
553
    }
1842
587
  }
1843
112
  // Output connections.
1844
699
  for (i = 0; i < graph->exec_info->rnum; 
i++587
)
1845
587
  {
1846
587
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1847
587
    if (exec_info->outgoings)
1848
1.19k
      
for (j = 0; 475
j < exec_info->outgoings->rnum;
j++721
)
1849
721
      {
1850
721
        const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
1851
721
        const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
1852
721
        // If both are sub-graphs, have both tail and head specified.
1853
721
        if (CCV_NNC_GRAPH_REF(exec_info)[0] && 
CCV_NNC_GRAPH_REF18
(outgoing_info)[0]18
)
1854
3
          fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
1855
718
        else if (CCV_NNC_GRAPH_REF(exec_info)[0] && 
!15
CCV_NNC_GRAPH_REF15
(outgoing_info)[0])
1856
15
          fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
1857
703
        else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1858
8
          fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
1859
695
        else
1860
695
          fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
1861
721
      }
1862
587
  }
1863
112
  fputs("}\n", out);
1864
112
  _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
1865
112
  ccfree(node_id);
1866
112
}
1867
1868
// Autotune every node reachable between sources and destinations: each node's
// cmd is replaced with the result of ccv_nnc_cmd_autotune(), which picks the
// best backend/algorithm within max_workspace_size. Passing 0/NULL for
// sources/destinations falls back to the graph's own recorded
// sources/destinations. Graph-typed nodes (while / case..of bodies) are
// recursed into instead of being tuned directly.
void ccv_nnc_graph_autotune(ccv_nnc_graph_t* const graph, const size_t max_workspace_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
	// exec current node, for synchronous CPU execution, no stream unit.
	int i;
#define visitor(node, idx, ...) \
	do { \
		if (node->cmd.cmd == CCV_NNC_NOOP) \
			continue; /* nothing to tune; 'continue' exits the do/while(0) */ \
		if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD) \
			/* A graph node: recurse into each sub-graph over its full range. */ \
			for (i = 0; i < node->graph_ref_size; i++) \
			{ \
				ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[i] - 1); \
				ccv_nnc_graph_autotune(sub_graph, max_workspace_size, flags, 0, 0, 0, 0); \
			} \
		else { \
			/* Need to unwrap these tensors */ \
			/* inputs & outputs are one contiguous allocation, so indexing \
			   inputs[i] up to input_size + output_size covers both. */ \
			for (i = 0; i < node->input_size + node->output_size; i++) \
				if (node->inputs[i] && CCV_IS_TENSOR_MULTIVIEW(node->inputs[i])) \
					node->inputs[i] = _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)node->inputs[i]); \
			PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); \
			for (i = 0; i < node->input_size; i++) \
				PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p)\n", i + 1, node->inputs[i], (node->inputs[i] ? node->inputs[i]->data.u8 : 0)); \
			for (i = 0; i < node->output_size; i++) \
				PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p)\n", i + 1, node->outputs[i], (node->outputs[i] ? node->outputs[i]->data.u8 : 0)); \
			/* Commit the tuned command back into the node. */ \
			node->cmd = ccv_nnc_cmd_autotune(node->cmd, max_workspace_size, node->hint, flags, node->inputs, node->input_size, node->outputs, node->output_size, 0); \
		} \
	} while (0)
	// Resolve effective traversal endpoints: explicit arguments win, else use
	// the graph's recorded sources/destinations, else empty.
	const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0): 0);
	const int graph_source_size = source_size ? source_size : (graph->sources ? graph->sources->rnum : 0);
	const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0);
	const int graph_destination_size = destination_size ? destination_size : (graph->destinations ? graph->destinations->rnum : 0);
	CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
#undef visitor
}
1902
1903
// Release everything a concrete graph owns, in order: per-exec allocations,
// tensor wrap arrays, bookkeeping arrays, the default schedule, streams and
// signals, carry-overs, sub-graphs (recursively), and finally the graph
// struct itself.
void ccv_nnc_graph_free(ccv_nnc_graph_t* const graph)
{
	for (int i = 0; i < graph->exec_info->rnum; i++)
	{
		ccv_nnc_graph_exec_info_t* const exec = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
		if (exec->_heap_graph_ref)
			ccfree(exec->_heap_graph_ref);
		if (exec->outgoings)
			ccv_array_free(exec->outgoings);
		// inputs & outputs share one contiguous allocation; freeing the
		// inputs pointer releases both.
		if (exec->inputs)
			ccfree(exec->inputs);
		if (exec->input_flags)
			ccfree(exec->input_flags);
		if (exec->updates)
			ccfree(exec->updates);
		if ((exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) && exec->p_while.inputs)
			ccfree(exec->p_while.inputs);
	}
	if (graph->tensor_wraps)
	{
		for (int i = 0; i < graph->tensor_wraps->rnum; i++)
		{
			ccv_nnc_graph_tensor_wrap_array_t* const wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, i);
			if (!wrap_array)
				continue;
			for (int j = 0; j < wrap_array->size; j++)
				_ccv_nnc_graph_tensor_wrap_free(wrap_array->tensor_wraps[j]);
			ccfree(wrap_array);
		}
		ccv_array_free(graph->tensor_wraps);
	}
	if (graph->tensor_wraps_refs)
		ccv_array_free(graph->tensor_wraps_refs);
	if (graph->breakpoints)
		ccfree(graph->breakpoints);
	if (graph->sources)
		ccv_array_free(graph->sources);
	if (graph->destinations)
		ccv_array_free(graph->destinations);
	if (graph->default_schedule)
		ccv_nnc_graph_static_schedule_free(graph->default_schedule);
	if (graph->streams)
	{
		// streams[0] is borrowed from the parent graph when one exists, so
		// only a root graph (no parent) frees it.
		if (!graph->p)
			ccv_nnc_stream_context_free(graph->streams[0]);
		for (int i = 1; i < graph->stream_size; i++)
			ccv_nnc_stream_context_free(graph->streams[i]);
		ccfree(graph->streams);
	}
	if (graph->block_stream_tasks)
		ccfree(graph->block_stream_tasks);
	if (graph->signals)
	{
		for (int i = 0; i < graph->signal_size; i++)
			ccv_nnc_stream_signal_free(graph->signals[i]);
		ccfree(graph->signals);
	}
	if (graph->carry_overs)
	{
		for (int i = 0; i < graph->carry_overs->rnum; i++)
		{
			ccv_nnc_graph_tensor_carry_over_t* const carry = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
			_ccv_nnc_graph_tensor_wrap_free(carry->from);
			_ccv_nnc_graph_tensor_wrap_free(carry->to);
		}
		ccv_array_free(graph->carry_overs);
	}
	if (graph->sub_graphs)
	{
		// Sub-graphs are owned by this graph; free them recursively.
		for (int i = 0; i < graph->sub_graphs->rnum; i++)
			ccv_nnc_graph_free(*(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i));
		ccv_array_free(graph->sub_graphs);
	}
	ccv_array_free(graph->exec_info);
	if (graph->buffer)
		ccfree(graph->buffer);
	ccfree(graph);
}