Coverage Report

Created: 2021-04-14 15:26

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_graph.c
 Line   Count  Source
    1          #include "ccv_nnc.h"
    2          #include "ccv_nnc_easy.h"
    3          #include "ccv_nnc_internal.h"
    4          #include "ccv_internal.h"
    5          #include "_ccv_nnc_graph.h"
    6
    7          // MARK - Level-2 API
    8
    9          ccv_nnc_graph_t* ccv_nnc_graph_new(void)
   10   6.10k  {
   11   6.10k    ccv_nnc_graph_t* graph = (ccv_nnc_graph_t*)cccalloc(1, sizeof(ccv_nnc_graph_t));
   12   6.10k    graph->exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), 5, 0);
   13   6.10k    return graph;
   14   6.10k  }
   15
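A minimal usage sketch of the Level-2 API in this file (cmd_a, cmd_b and the tensor arrays are placeholders; ccv_nnc_no_hint is assumed from the wider ccv_nnc API; treat this as an illustration, not the canonical recipe):

    // Build a two-node graph a -> b, then topsort it.
    ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
    ccv_nnc_graph_exec_t a = ccv_nnc_graph_exec_new(graph, cmd_a, ccv_nnc_no_hint, a_inputs, 2, a_outputs, 1);
    ccv_nnc_graph_exec_t b = ccv_nnc_graph_exec_new(graph, cmd_b, ccv_nnc_no_hint, a_outputs, 1, b_outputs, 1);
    ccv_nnc_graph_exec_concat(graph, a, b); // a must run before b.
    ccv_nnc_graph_set_sources(graph, &a, 1);
    ccv_nnc_graph_set_destinations(graph, &b, 1);
    int exec_cvt[2];
    ccv_nnc_graph_topsort(graph, exec_cvt, 2); // Reorders exec_info; a.d / b.d are remapped via exec_cvt.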
   16          void ccv_nnc_graph_set_sources(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const sources, const int source_size)
   17   6.09k  {
   18   6.09k    if (!graph->sources)
   19   6.09k      graph->sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), source_size, 0);
   20       0    else
   21       0      ccv_array_clear(graph->sources);
   22   6.09k    int i;
   23   12.1k    for (i = 0; i < source_size; i++)
   24   6.09k      ccv_array_push(graph->sources, sources + i);
   25   6.09k    graph->topsorted = 0;
   26   6.09k  }
   27
   28          ccv_nnc_graph_exec_t* ccv_nnc_graph_sources(const ccv_nnc_graph_t* const graph)
   29       0  {
   30       0    return graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0;
   31       0  }
   32
   33          int ccv_nnc_graph_source_size(const ccv_nnc_graph_t* const graph)
   34       0  {
   35       0    return graph->sources ? graph->sources->rnum : 0;
   36       0  }
   37
   38          void ccv_nnc_graph_set_destinations(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
   39   6.09k  {
   40   6.09k    if (!graph->destinations)
   41   6.09k      graph->destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), destination_size, 0);
   42       0    else
   43       0      ccv_array_clear(graph->destinations); // Fixed: previously cleared graph->sources by copy-paste mistake (never executed in this run).
   44   6.09k    int i;
   45   12.1k    for (i = 0; i < destination_size; i++)
   46   6.09k      ccv_array_push(graph->destinations, destinations + i);
   47   6.09k    graph->topsorted = 0;
   48   6.09k  }
   49
   50          ccv_nnc_graph_exec_t* ccv_nnc_graph_destinations(const ccv_nnc_graph_t* const graph)
   51       0  {
   52       0    return graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0;
   53       0  }
   54
   55          int ccv_nnc_graph_destination_size(const ccv_nnc_graph_t* const graph)
   56       0  {
   57       0    return graph->destinations ? graph->destinations->rnum : 0;
   58       0  }
   59
   60          void ccv_nnc_graph_exec_set(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_cmd_t cmd)
   61   41.3k  {
   62   41.3k    assert(exec.d < graph->exec_info->rnum);
   63   41.3k    assert(exec.graph == graph);
   64   41.3k    ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   65   41.3k    exec_info->cmd = cmd;
   66   41.3k  }
   67
   68          void ccv_nnc_graph_exec_set_hint(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_hint_t hint)
   69     176  {
   70     176    assert(exec.d < graph->exec_info->rnum);
   71     176    assert(exec.graph == graph);
   72     176    ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   73     176    exec_info->hint = hint;
   74     176  }
   75
   76          static int _ccv_nnc_tensor_multiview_level_count(const ccv_nnc_tensor_multiview_t* const mv)
   77     482  {
   78     482    if (!CCV_IS_TENSOR_MULTIVIEW(mv))
   79     482      return 1;
   80     155    const int count = mv->kind + mv->repeat;
   81     155    int i, c = 0;
   82     502    for (i = 0; i < count; i++)
   83     347    {
   84     347      ccv_nnc_tensor_t* tv = CCV_NNC_MULTIVIEW_DATA(mv)[i];
   85     347      if (tv == CCV_NNC_TENSOR_PLACEHOLDER)
   86     347        c = ccv_max(c, 1);
   87     347      else
   88     347        c = ccv_max(c, _ccv_nnc_tensor_multiview_level_count((ccv_nnc_tensor_multiview_t*)tv));
   89     347    }
   90     155    return c + 1;
   91     155  }
   92
   93          static ccv_nnc_graph_tensor_wrap_t* _ccv_nnc_graph_tensor_wrap_new(const ccv_nnc_tensor_multiview_t* const mv)
   94     143  {
   95     143    const int level_count = _ccv_nnc_tensor_multiview_level_count(mv);
   96     143    ccv_nnc_graph_tensor_wrap_t* tensor_wrap = (ccv_nnc_graph_tensor_wrap_t*)ccmalloc(sizeof(ccv_nnc_graph_tensor_wrap_t) + sizeof(ccv_nnc_tensor_t*) * (level_count - 1));
   97     143    tensor_wrap->update_required = 0;
   98     143    tensor_wrap->count = level_count;
   99     143    tensor_wrap->index = 0;
  100     143    tensor_wrap->tensors[0] = (ccv_nnc_tensor_t*)mv;
  101     143    return tensor_wrap;
  102     143  }
  103
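_ccv_nnc_graph_tensor_wrap_new above sizes its allocation with the classic one-element-tail-array idiom: the struct declares tensors[1] and the malloc adds level_count - 1 extra pointer slots. The same idiom in isolation (a self-contained sketch, not ccv code):

    #include <stdlib.h>

    typedef struct {
      int count;
      void* slots[1]; /* tail array; really `count` entries after over-allocation */
    } tail_array_t;

    static tail_array_t* tail_array_new(const int n)
    {
      /* n - 1 extra slots beyond the one declared in the struct. */
      tail_array_t* const a = (tail_array_t*)malloc(sizeof(tail_array_t) + sizeof(void*) * (n - 1));
      if (a)
        a->count = n;
      return a;
    }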
  104          static void _ccv_nnc_graph_exec_rewind(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
  105      23  {
  106      23    if (!info->tensor_wraps_ref)
  107      22      return;
  108       1    int i;
  109       1    assert(info->tensor_wraps_ref <= graph->tensor_wraps->rnum);
  110       1    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
  111       1    // Rewind from tensor wraps.
  112       3    for (i = 0; i < info->input_size; i++)
  113       2      if (tensor_wrap_array->tensor_wraps[i])
  114       1        info->inputs[i] = tensor_wrap_array->tensor_wraps[i]->tensors[0];
  115       1    const int d = info->input_size;
  116       2    for (i = 0; i < info->output_size; i++)
  117       1      if (tensor_wrap_array->tensor_wraps[d + i])
  118       1        info->outputs[i] = tensor_wrap_array->tensor_wraps[d + i]->tensors[0];
  119       1    const int dd = info->input_size + info->output_size;
  120       1    for (i = 0; i < info->update_size; i++)
  121       0      if (tensor_wrap_array->tensor_wraps[dd + i])
  122       0        info->updates[i] = tensor_wrap_array->tensor_wraps[dd + i]->tensors[0];
  123       1  }
  124
  125          static void _ccv_nnc_graph_tensor_wrap_free(ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
  126     195  {
  127     195    ccfree(tensor_wrap);
  128     195  }
  129
  130          ccv_nnc_graph_tensor_wrap_array_t* ccv_nnc_get_tensor_wrap_array(ccv_nnc_graph_t* const graph, const int tensor_wrap_size, int* const tensor_wraps_ref)
  131      62  {
  132      62    ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = *tensor_wraps_ref ? (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, *tensor_wraps_ref - 1) : 0;
  133      62    // Otherwise, find an open slot.
  134      62    if (!tensor_wrap_array_ref)
  135      53    {
  136      53      if (!graph->tensor_wraps)
  137      27        graph->tensor_wraps = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wrap_array_t*), 0, 0);
  138      53      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = 0;
  139      53      ccv_array_push(graph->tensor_wraps, &tensor_wrap_array);
  140      53      tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, graph->tensor_wraps->rnum - 1);
  141      53      *tensor_wraps_ref = graph->tensor_wraps->rnum;
  142      53    }
  143      62    int i;
  144      62    if (*tensor_wrap_array_ref)
  145       9    {
  146       9      if ((*tensor_wrap_array_ref)->size != tensor_wrap_size)
  147       9        *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)ccrealloc(*tensor_wrap_array_ref, sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1));
  148      18      for (i = (*tensor_wrap_array_ref)->size; i < tensor_wrap_size; i++)
  149       9        (*tensor_wrap_array_ref)->tensor_wraps[i] = 0;
  150       9    } else
  151      53      *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)cccalloc(sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1), 1);
  152      62    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
  153      62    tensor_wrap_array->size = tensor_wrap_size;
  154      62    return tensor_wrap_array;
  155      62  }
  156
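Note the convention used here and throughout the file: tensor_wraps_ref is stored 1-based so that 0 can double as "unset"; every lookup subtracts 1 and every store adds 1. A two-line restatement of the idiom (illustrative only):

    int ref = 0;                         // 0 means "no slot assigned yet".
    /* store */ ref = slot_index + 1;    // Shift by one so index 0 stays representable.
    /* load  */ if (ref) use(ref - 1);   // Unshift before indexing.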
  157          void ccv_nnc_set_tensor_wraps(ccv_nnc_graph_tensor_wrap_t** const tensor_wraps, ccv_nnc_tensor_t* const* const tensors, const int tensor_size)
  158     184  {
  159     184    int i;
  160     349    for (i = 0; i < tensor_size; i++)
  161     165      if (tensors[i])
  162     164      {
  163     164        if (CCV_IS_TENSOR_MULTIVIEW(tensors[i]) &&
  164     164          ((ccv_nnc_tensor_multiview_t*)tensors[i])->anchor != CCV_NNC_MULTIVIEW_PHI)
  165     164        {
  166     107          if (!tensor_wraps[i] || tensors[i] != tensor_wraps[i]->tensors[0])
  167      93          {
  168      93            if (tensor_wraps[i])
  169       0              _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
  170      93            tensor_wraps[i] = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)tensors[i]);
  171      93          }
  172     107        } else {
  173      57          if (tensor_wraps[i])
  174       0            _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
  175      57          tensor_wraps[i] = 0;
  176      57        }
  177     164      }
  178     184  }
  179
  180          void ccv_nnc_graph_register_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
  181      53  {
  182      53    ccv_nnc_graph_t* p = graph;
  183      53    const ccv_nnc_graph_tensor_wraps_ref_t tensor_wraps_ref = {
  184      53      .d = tensor_wraps_ref_d,
  185      53      .graph = graph,
  186      53    };
  187      99    do {
  188      99      if (!p->tensor_wraps_refs)
  189      44      {
  190      44        p->tensor_wraps_refs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wraps_ref_t), 0, 0);
  191      44        ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
  192      55      } else {
  193      55        int i;
  194      55        int has_tensor_wraps_ref = 0;
  195     152        for (i = 0; !has_tensor_wraps_ref && i < p->tensor_wraps_refs->rnum; i++)
  196      97        {
  197      97          ccv_nnc_graph_tensor_wraps_ref_t* tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
  198      97          has_tensor_wraps_ref = (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph);
  199      97        }
  200      55        if (!has_tensor_wraps_ref)
  201      55          ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
  202      55      }
  203      99      p = p->p;
  204      99    } while (p);
  205      53  }
  206
  207          static void _ccv_nnc_graph_redo_tensor_wraps(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
  208   32.1k  {
  209   32.1k    int i;
  210   32.1k    const int has_wrap = ccv_nnc_tensors_have_wraps(info->inputs, info->input_size) ||
  211   32.1k      ccv_nnc_tensors_have_wraps(info->outputs, info->output_size) ||
  212   32.1k      ccv_nnc_tensors_have_wraps(info->updates, info->update_size);
  213   32.1k    if (has_wrap)
  214      61    {
  215      61      const int tensor_wrap_size = info->input_size + info->output_size + info->update_size;
  216      61      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = ccv_nnc_get_tensor_wrap_array(graph, tensor_wrap_size, &info->tensor_wraps_ref);
  217      61      ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps, info->inputs, info->input_size);
  218      61      const int d = info->input_size;
  219      61      ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + d, info->outputs, info->output_size);
  220      61      const int dd = info->input_size + info->output_size;
  221      61      ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + dd, info->updates, info->update_size);
  222   32.0k    } else if (info->tensor_wraps_ref) {
  223       1      ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
  224       1      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
  225       1      if (tensor_wrap_array)
  226       1      {
  227       4        for (i = 0; i < tensor_wrap_array->size; i++)
  228       3          if (tensor_wrap_array->tensor_wraps[i])
  229       2            _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[i]);
  230       1        ccfree(tensor_wrap_array);
  231       1        *tensor_wrap_array_ref = 0;
  232       1        info->tensor_wraps_ref = 0;
  233       1      }
  234       1    }
  235   32.1k  }
  236
  237          static void _ccv_nnc_graph_deregister_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
  238       1  {
  239       1    ccv_nnc_graph_t* p = graph;
  240       2    do {
  241       2      int i;
  242       2      // Remove from the array.
  243       2      if (p->tensor_wraps_refs)
  244       2        for (i = 0; i < p->tensor_wraps_refs->rnum; i++)
  245       2        {
  246       2          ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
  247       2          if (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph)
  248       2          {
  249       2            --p->tensor_wraps_refs->rnum;
  250       2            if (i < p->tensor_wraps_refs->rnum)
  251       0              memcpy(tensor_wraps_ref, tensor_wraps_ref + 1, sizeof(ccv_nnc_graph_exec_t) * (p->tensor_wraps_refs->rnum - i));
  252       2            break;
  253       2          }
  254       2        }
  255       2      p = p->p;
  256       2    } while (p);
  257       1  }
  258
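A caveat on line 251 above (not executed in this run): the source and destination of that memcpy overlap whenever an element other than the last is removed, which is undefined behavior in C; memmove is the safe primitive for shifting a tail left. A minimal sketch of the safe variant (generic, not ccv code):

    #include <string.h>

    /* Remove element i from an n-element array by shifting the tail left.
     * memmove, not memcpy, because src and dst overlap. */
    static void remove_at(int* const array, const int n, const int i)
    {
      if (i < n - 1)
        memmove(array + i, array + i + 1, sizeof(int) * (n - i - 1));
    }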
  259          void ccv_nnc_graph_exec_set_io_flags(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const int* const input_flags, const int input_flag_size, const int* const output_flags, const int output_flag_size)
  260   31.7k  {
  261   31.7k    assert(exec.d < graph->exec_info->rnum);
  262   31.7k    assert(exec.graph == graph);
  263   31.7k    ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  264   31.7k    assert(input_flag_size <= info->input_size);
  265   31.7k    assert(output_flag_size <= info->output_size);
  266   31.7k    if (info->input_size + info->output_size == 0)
  267      19      return;
  268   31.7k    if (!info->input_flags)
  269   31.7k    {
  270   31.7k      info->input_flags = (int*)cccalloc(info->input_size + info->output_size, sizeof(int));
  271   31.7k      info->output_flags = info->input_flags + info->input_size;
  272   31.7k    }
  273   31.7k    if (input_flag_size > 0)
  274       0      memcpy(info->input_flags, input_flags, sizeof(int) * input_flag_size);
  275   31.7k    if (output_flag_size > 0)
  276       0      memcpy(info->output_flags, output_flags, sizeof(int) * output_flag_size);
  277   31.7k  }
  278
void ccv_nnc_graph_exec_pair_with(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_graph_exec_t pair_exec)
280
443
{
281
443
  assert(exec.graph == graph);
282
443
  assert(exec.d >= 0);
283
443
  assert(exec.d < graph->exec_info->rnum);
284
443
  assert(pair_exec.graph == graph || pair_exec.graph == graph->pair);
285
443
  assert(pair_exec.d >= 0);
286
443
  if (pair_exec.graph == graph)
287
439
    { assert(pair_exec.d < graph->exec_info->rnum); }
288
4
  else
289
4
    { assert(pair_exec.d < graph->pair->exec_info->rnum); }
290
443
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
291
443
  exec_info->pair_ref = pair_exec.d + 1;
292
443
}
293
  294          static ccv_nnc_tensor_t* _ccv_nnc_any_tensor_from_tensor_multiview(ccv_nnc_tensor_multiview_t* const mv)
  295      92  {
  296      92    ccv_nnc_tensor_t* tensor = (ccv_nnc_tensor_t*)mv;
  297     188    while (CCV_IS_TENSOR_MULTIVIEW(tensor))
  298      96    {
  299      96      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
  300      96      const int count = 0;
  301      96      const int off = mv->kind;
  302      96      const int mod = mv->repeat;
  303      96      // If reached the root.
  304      96      tensor = CCV_NNC_MULTIVIEW_DATA(mv)[count >= off ? ((count - off) % mod) + off : count]; // Unwrap.
  305      96    }
  306      92    return tensor;
  307      92  }
  308
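The subscript on line 304 is the general multiview unwrap rule: the first `kind` slots are consumed once, then the remaining `repeat` slots cycle forever. With count pinned to 0 this helper always lands on slot 0; for a live loop counter the mapping looks like this (a worked example with off = kind = 1, mod = repeat = 2):

    // count:       0  1  2  3  4  5 ...
    // slot picked: 0  1  2  1  2  1 ...
    // count <  off  -> slot = count                          (the one-shot "kind" views)
    // count >= off  -> slot = ((count - off) % mod) + off    (cycle through the repeats)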
  309          void ccv_nnc_graph_exec_set_io(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
  310      23  {
  311      23    assert(exec.d < graph->exec_info->rnum);
  312      23    assert(exec.graph == graph);
  313      23    ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  314      23    // De-register from the graph if it contains multiview tensors.
  315      23    if (info->tensor_wraps_ref)
  316       1      _ccv_nnc_graph_deregister_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  317      23    // In case it is already executed, rewind.
  318      23    _ccv_nnc_graph_exec_rewind(info, graph);
  319      23    if (input_size == 0 && output_size == 0)
  320       1    {
  321       1      if (info->input_size > 0 || info->output_size > 0)
  322       1        ccfree(info->inputs);
  323       1      info->inputs = 0;
  324       1      info->outputs = 0;
  325       1      info->input_size = 0;
  326       1      info->output_size = 0;
  327       1      _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  328       1      if (info->tensor_wraps_ref)
  329       0        ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  330       1      return;
  331       1    }
  332      22    if (info->inputs)
  333       2      info->inputs = (ccv_nnc_tensor_t**)ccrealloc(info->inputs, sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  334      20    else
  335      20      info->inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  336      22    info->outputs = info->inputs + input_size;
  337      22    if (inputs)
  338      22      memcpy(info->inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
  339      22    if (outputs)
  340      22      memcpy(info->outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
  341      22    int i;
  342      22    int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
  343      77    for (i = 0; i < input_size + output_size; i++)
  344      55      if (info->inputs[i])
  345      55      {
  346      55        ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info->inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info->inputs[i]) : info->inputs[i];
  347      55        tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= tensor->info.datatype;
  348      55      }
  349      22    info->cmd.backend = ccv_nnc_cmd_find_backend(info->cmd, tensor_memory, tensor_formats, tensor_datatypes);
  350      22    info->input_size = input_size;
  351      22    info->output_size = output_size;
  352      22    _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  353      22    // Register again if the tensor wraps exist.
  354      22    if (info->tensor_wraps_ref)
  355       2      ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  356      22    // Free flags.
  357      22    if (info->input_flags)
  358       0    {
  359       0      ccfree(info->input_flags);
  360       0      info->input_flags = info->output_flags = 0;
  361       0    }
  362      22  }
  363
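Lines 343-349 above (and the identical pattern in ccv_nnc_graph_exec_new below) OR every tensor's memory type, format, and datatype into three masks and hand them to ccv_nnc_cmd_find_backend, so the chosen backend must support the union of everything the exec touches. The folding step in isolation (hypothetical flag values; only the |= accumulation is the point):

    enum { MEM_CPU = 1 << 0, MEM_GPU = 1 << 1 }; /* stand-ins for the real memory-type bits */

    static int fold_flags(const int* const flags, const int n)
    {
      int mask = 0;
      for (int i = 0; i < n; i++)
        mask |= flags[i]; /* e.g. MEM_CPU | MEM_GPU for a mixed CPU/GPU exec */
      return mask;
    }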
  364          void ccv_nnc_graph_exec_add_as_affected(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const update)
  365      23  {
  366      23    assert(CCV_IS_TENSOR_MULTIVIEW(update));
  367      23    assert(exec.d < graph->exec_info->rnum);
  368      23    assert(exec.graph == graph);
  369      23    ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  370      23    const int register_tensor_wraps = !info->tensor_wraps_ref;
  371      23    const int update_index = info->update_size;
  372      23    ++info->update_size;
  373      23    if (info->updates)
  374       6      info->updates = (ccv_nnc_tensor_t**)ccrealloc(info->updates, sizeof(ccv_nnc_tensor_t*) * info->update_size);
  375      17    else
  376      17      info->updates = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * info->update_size);
  377      23    info->updates[update_index] = update;
  378      23    _ccv_nnc_graph_redo_tensor_wraps(info, graph);
  379      23    if (register_tensor_wraps)
  380      14      ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
  381      23  }
  382
  383          ccv_nnc_graph_exec_t ccv_nnc_graph_exec_new(ccv_nnc_graph_t* const graph, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
  384   32.0k  {
  385   32.0k    int d = graph->exec_info->rnum;
  386   32.0k    ccv_nnc_graph_exec_info_t info = {
  387   32.0k      .cmd = cmd,
  388   32.0k      .hint = hint,
  389   32.0k      .input_size = input_size,
  390   32.0k      .output_size = output_size,
  391   32.0k    };
  392   32.0k    assert(inputs || input_size == 0);
  393   32.0k    assert(outputs || output_size == 0);
  394   32.0k    if (input_size > 0 || output_size > 0)
  395   31.8k    {
  396   31.8k      info.inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
  397   31.8k      info.outputs = info.inputs + input_size;
  398   31.8k      if (inputs)
  399   31.7k        memcpy(info.inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
  400   31.8k      if (outputs)
  401   31.8k        memcpy(info.outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
  402   31.8k      info.input_size = input_size;
  403   31.8k      info.output_size = output_size;
  404   31.8k      int i;
  405   31.8k      int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
  406    171k      for (i = 0; i < input_size + output_size; i++)
  407    139k        if (info.inputs[i])
  408    108k        {
  409    108k          ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info.inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info.inputs[i]) : info.inputs[i];
  410    108k          tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= tensor->info.datatype;
  411    108k        }
  412   31.8k      info.cmd.backend = ccv_nnc_cmd_find_backend(info.cmd, tensor_memory, tensor_formats, tensor_datatypes);
  413   31.8k    }
  414   32.0k    _ccv_nnc_graph_redo_tensor_wraps(&info, graph);
  415   32.0k    // Add itself to the graph's wraps array; this will help the run time when we run the graph and do unwrapping.
  416   32.0k    if (info.tensor_wraps_ref)
  417      36      ccv_nnc_graph_register_tensor_wraps(graph, info.tensor_wraps_ref - 1);
  418   32.0k    ccv_array_push(graph->exec_info, &info);
  419   32.0k    return (ccv_nnc_graph_exec_t){
  420   32.0k      .d = d,
  421   32.0k      .graph = graph,
  422   32.0k    };
  423   32.0k  }
  424
  425          void ccv_nnc_graph_add_carry_over(ccv_nnc_graph_t* const graph, const ccv_nnc_tensor_t* const from, const ccv_nnc_tensor_t* const to)
  426      25  {
  427      25    ccv_nnc_graph_tensor_carry_over_t carry_over = {
  428      25      .from = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)from),
  429      25      .to = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)to)
  430      25    };
  431      25    if (!graph->carry_overs)
  432      21      graph->carry_overs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_carry_over_t), 0, 0);
  433      25    ccv_array_push(graph->carry_overs, &carry_over);
  434      25  }
  435
  436          int ccv_nnc_graph_exec_concat(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
  437   28.8k  {
  438   28.8k    assert(graph == source.graph);
  439   28.8k    assert(graph == destination.graph);
  440   28.8k    assert(source.d < graph->exec_info->rnum);
  441   28.8k    assert(destination.d < graph->exec_info->rnum);
  442   28.8k    ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
  443   28.8k    if (src_info->outgoings == 0)
  444   25.9k      src_info->outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
  445   2.89k    else {
  446   2.89k      int i;
  447   2.89k      // Check if this is already connected, if so, skip.
  448   8.92k      for (i = 0; i < src_info->outgoings->rnum; i++)
  449   6.02k        if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
  450       0          return -1;
  451   2.89k    }
  452   28.8k    ccv_array_push(src_info->outgoings, &destination.d);
  453   28.8k    graph->topsorted = 0;
  454   28.8k    return 0;
  455   28.8k  }
  456
  457          int ccv_nnc_graph_exec_disjoin(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
  458       0  {
  459       0    assert(graph == source.graph);
  460       0    assert(graph == destination.graph);
  461       0    assert(source.d < graph->exec_info->rnum);
  462       0    assert(destination.d < graph->exec_info->rnum);
  463       0    ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
  464       0    if (!src_info->outgoings)
  465       0      return -1;
  466       0    int i, j = -1;
  467       0    // Find the edge to remove; if it isn't connected, skip.
  468       0    for (i = 0; i < src_info->outgoings->rnum; i++)
  469       0      if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
  470       0      {
  471       0        j = i;
  472       0        break;
  473       0      }
  474       0    if (j < 0)
  475       0      return -1;
  476       0    if (j < src_info->outgoings->rnum - 1)
  477       0      *(int*)ccv_array_get(src_info->outgoings, j) = *(int*)ccv_array_get(src_info->outgoings, src_info->outgoings->rnum - 1);
  478       0    --src_info->outgoings->rnum;
  479       0    ccv_nnc_graph_exec_info_t* dest_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, destination.d);
  480       0    if (dest_info->outgoings)
  481       0      for (i = 0; i < dest_info->outgoings->rnum; i++)
  482       0        ccv_array_add_unique_int(src_info->outgoings, *(int*)ccv_array_get(dest_info->outgoings, i));
  483       0    graph->topsorted = 0;
  484       0    return 0;
  485       0  }
  486
  487          int ccv_nnc_graph_exec_count(const ccv_nnc_graph_t* const graph)
  488       0  {
  489       0    return graph->exec_info ? graph->exec_info->rnum : 0;
  490       0  }
  491
  492          void* ccv_nnc_graph_buffer(ccv_nnc_graph_t* const graph, int size)
  493   26.3k  {
  494   26.3k    if (graph->buffer_size >= size)
  495   26.0k      return graph->buffer;
  496     315    graph->buffer_size = size;
  497     315    graph->buffer = (graph->buffer) ? ccrealloc(graph->buffer, size) : ccmalloc(size);
  498     315    return graph->buffer;
  499     315  }
  500
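ccv_nnc_graph_buffer is a grow-only scratch allocator: it reallocates only when a request exceeds the high-water mark, so steady-state calls are free (26.0k of 26.3k calls above took the cached path). The idiom in isolation (a generic sketch, not ccv code):

    #include <stdlib.h>

    typedef struct { void* buf; size_t cap; } scratch_t;

    /* Return a buffer of at least `size` bytes; capacity only ever grows. */
    static void* scratch_get(scratch_t* const s, const size_t size)
    {
      if (s->cap >= size)
        return s->buf;
      s->buf = s->buf ? realloc(s->buf, size) : malloc(size);
      s->cap = size;
      return s->buf;
    }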
  501          void ccv_nnc_graph_topsort(ccv_nnc_graph_t* const graph, int* const exec_cvt, const int exec_cvt_size)
  502   6.09k  {
  503   6.09k    assert(exec_cvt_size == graph->exec_info->rnum);
  504   6.09k    assert(graph->sources && graph->sources->rnum);
  505   6.09k    assert(graph->destinations && graph->destinations->rnum);
  506   6.09k    int i, j;
  507   38.1k    for (i = 0; i < exec_cvt_size; i++)
  508   32.0k      exec_cvt[i] = -1;
  509   6.09k    ccv_array_t* exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), graph->exec_info->rnum, 0);
  510   6.09k    // If there are breakpoints, it is more complicated: we first sort up to the breakpoints, then continue from the breakpoints to the destinations.
  511   6.09k    if (graph->breakpoint_size)
  512      21    {
  513      42      ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, graph->breakpoints, graph->breakpoint_size, 0);
  514      42      for (i = 0; i < graph->breakpoint_size; i++)
  515      21        exec_cvt[graph->breakpoints[i].d] = -2; // Mark these as breakpoints, so we will skip them in the first round.
  516      42      ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  517      32        assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  518      32        if (exec_cvt[idx] == -2) // Skip breakpoint.
  519      21          continue;
  520      11        // Loop over node and push to the array.
  521      11        ccv_array_push(exec_info, node);
  522      11        // Go to its sub-graph to fix exec_idx
  523      11        for (i = 0; i < node->graph_ref_size; i++)
  524       0        {
  525       0          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  526       0          if (graph_ref >= 0)
  527       0          {
  528       0            ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  529       0            sub_graph->exec_idx = exec_info->rnum;
  530       0          }
  531       0        }
  532      11        exec_cvt[idx] = exec_info->rnum - 1;
  533      11      } ccv_nnc_graph_visit_endfor
  534      42      ccv_nnc_graph_visit_free(visit);
  535      21      graph->breakpoint_offset = exec_info->rnum;
  536      42      visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
  537      44      ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  538      44        assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  539      44        // Loop over node and push to the array.
  540      44        ccv_array_push(exec_info, node);
  541      44        // Go to its sub-graph to fix exec_idx
  542      52        for (i = 0; i < node->graph_ref_size; i++)
  543       8        {
  544       8          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  545       8          if (graph_ref >= 0)
  546       8          {
  547       8            ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  548       8            sub_graph->exec_idx = exec_info->rnum;
  549       8          }
  550       8        }
  551      44        exec_cvt[idx] = exec_info->rnum - 1;
  552      44      } ccv_nnc_graph_visit_endfor
  553      42      ccv_nnc_graph_visit_free(visit);
  554      42      for (i = 0; i < graph->breakpoint_size; i++)
  555      21        { assert(exec_cvt[graph->breakpoints[i].d] >= 0); } // All breakpoints should be assigned.
  556   6.07k    } else {
  557   12.1k      ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
  558   31.9k      ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
  559   31.9k        assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
  560   31.9k        // Loop over node and push to the array.
  561   31.9k        ccv_array_push(exec_info, node);
  562   31.9k        // Go to its sub-graph to fix exec_idx
  563   32.0k        for (i = 0; i < node->graph_ref_size; i++)
  564      42        {
  565      42          const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
  566      42          if (graph_ref >= 0)
  567      42          {
  568      42            ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
  569      42            sub_graph->exec_idx = exec_info->rnum;
  570      42          }
  571      42        }
  572   31.9k        exec_cvt[idx] = exec_info->rnum - 1;
  573   31.9k      } ccv_nnc_graph_visit_endfor
  574   12.1k      ccv_nnc_graph_visit_free(visit);
  575   6.07k    }
  576   6.09k    assert(graph->exec_info->rnum == exec_info->rnum);
  577   6.09k    ccv_array_free(graph->exec_info);
  578   6.09k    graph->exec_info = exec_info;
  579   12.1k    for (i = 0; i < graph->sources->rnum; i++)
  580   6.09k    {
  581   6.09k      ccv_nnc_graph_exec_t* const source = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, i);
  582   6.09k      source->d = exec_cvt[source->d];
  583   6.09k    }
  584   12.1k    for (i = 0; i < graph->destinations->rnum; i++)
  585   6.09k    {
  586   6.09k      ccv_nnc_graph_exec_t* const destination = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, i);
  587   6.09k      destination->d = exec_cvt[destination->d];
  588   6.09k    }
  589   6.09k    // Update all outgoings to reflect the latest.
  590   38.1k    for (i = 0; i < exec_info->rnum; i++)
  591   32.0k    {
  592   32.0k      ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(exec_info, i);
  593   32.0k      if (info->outgoings)
  594   54.7k        for (j = 0; j < info->outgoings->rnum; j++)
  595   28.8k          *(int*)ccv_array_get(info->outgoings, j) = exec_cvt[*(int*)ccv_array_get(info->outgoings, j)];
  596   32.0k    }
  597   6.09k    graph->topsorted = 1;
  598   6.09k  }
  599
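Because the sort rewrites every d index, callers holding ccv_nnc_graph_exec_t handles across a topsort must remap them through exec_cvt. A sketch of the calling convention (exec_count and my_exec are hypothetical caller-side names):

    int* const exec_cvt = (int*)malloc(sizeof(int) * exec_count);
    ccv_nnc_graph_topsort(graph, exec_cvt, exec_count);
    my_exec.d = exec_cvt[my_exec.d]; // Old index -> new position in exec_info.
    free(exec_cvt);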
  600          typedef struct {
  601            int device_id;
  602            int exec_idx;
  603            ccv_array_t* signal_set;
  604            ccv_array_t* command_set; // The set of commands executed on this stream. In case there is a tie (on rank), we will check this.
  605          } ccv_nnc_stream_data_t;
  606
  607          static void _ccv_nnc_graph_schedule_assign_signals(ccv_array_t* const incoming, ccv_nnc_graph_exec_schedule_t* const node, ccv_array_t* const stream_data, int* const signal_size, ccv_nnc_graph_exec_schedule_t* const exec_info, const int exec_info_size)
  608   4.82k  {
  609   4.82k    assert(incoming->rnum > 0);
  610   4.82k    int i, j, k;
  611   4.82k    int wait_size = 0, max_wait_size = 0;
  612   10.8k    for (i = 0; i < incoming->rnum; i++)
  613   5.98k    {
  614   5.98k      const int incoming_idx = *(int*)ccv_array_get(incoming, i);
  615   5.98k      ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
  616   5.98k      assert(incoming_exec_info->stream_size > 0);
  617   5.98k      max_wait_size += incoming_exec_info->stream_size;
  618   5.98k    }
  619   4.82k    int waits[ccv_max(1, max_wait_size)];
  620   4.82k    assert(node->stream_size > 0);
  621   10.8k    for (i = 0; i < incoming->rnum; i++)
  622   5.98k    {
  623   5.98k      const int incoming_idx = *(int*)ccv_array_get(incoming, i);
  624   5.98k      assert(incoming_idx < exec_info_size);
  625   5.98k      assert(incoming_idx >= 0);
  626   5.98k      ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
  627   5.98k      assert(incoming_exec_info->stream_size > 0);
  628   5.98k      int stream_synced = 1;
  629   5.98k      // If the current node's streams are a subset of the incoming node's streams, there
  630   5.98k      // is no need to sync with a signal, because we are already synced with the incoming.
  631   11.9k      for (j = 0; stream_synced && j < node->stream_size; j++)
  632   5.99k      {
  633   5.99k        const int s = SCHEDULE_STREAMS(*node)[j];
  634   5.99k        assert(s >= 0);
  635   5.99k        int flag = 0;
  636   12.8k        for (k = 0; !flag && k < incoming_exec_info->stream_size; k++)
  637   6.82k          flag = (SCHEDULE_STREAMS(*incoming_exec_info)[k] == s);
  638   5.99k        stream_synced = flag;
  639   5.99k      }
  640   5.98k      if (stream_synced)
  641   4.28k        continue;
  642   1.70k      // Otherwise, find the streams we need to sync with, and create signals for these.
  643   3.42k      for (j = 0; j < incoming_exec_info->stream_size; j++)
  644   1.71k      {
  645   1.71k        const int s = SCHEDULE_STREAMS(*incoming_exec_info)[j];
  646   1.71k        assert(s >= 0);
  647   1.71k        int flag = 0;
  648   4.43k        for (k = 0; !flag && k < node->stream_size; k++)
  649   2.72k          flag = (SCHEDULE_STREAMS(*node)[k] == s);
  650   1.71k        if (!flag) // Need to have a signal.
  651   1.69k        {
  652   1.69k          if (SCHEDULE_SIGNALS(*incoming_exec_info)[j] < 0)
  653   1.69k            SCHEDULE_SIGNALS(*incoming_exec_info)[j] = (*signal_size)++;
  654     405          else {
  655     405            int flag = 0;
  656     405            // If any stream of the current node has already seen this signal, we are good already.
  657   1.36k            for (k = 0; !flag && k < node->stream_size; k++)
  658     955            {
  659     955              assert(SCHEDULE_STREAMS(*node)[k] >= 0);
  660     955              ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
  661     955              flag = (data->signal_set && ccv_array_find_int(data->signal_set, SCHEDULE_SIGNALS(*incoming_exec_info)[j]));
  662     955            }
  663     405            if (flag)
  664       0              continue;
  665   1.69k          }
  666   1.69k          // Otherwise, we need to wait for this. Currently, our granularity is to wait on all streams.
  667   1.69k          waits[wait_size++] = SCHEDULE_SIGNALS(*incoming_exec_info)[j];
  668   1.69k          // All streams on this node have seen this signal.
  669   4.36k          for (k = 0; k < node->stream_size; k++)
  670   2.66k          {
  671   2.66k            ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
  672   2.66k            if (!data->signal_set)
  673     896              data->signal_set = ccv_array_new(sizeof(int), 0, 0);
  674   2.66k            ccv_array_push(data->signal_set, &SCHEDULE_SIGNALS(*incoming_exec_info)[j]);
  675   2.66k          }
  676   1.69k        }
  677   1.71k      }
  678   1.70k    }
  679   4.82k    node->wait_size = wait_size;
  680   4.82k    if (wait_size > 0)
  681     801    {
  682     801      node->waits = node->waits ? ccrealloc(node->waits, sizeof(int) * wait_size) : ccmalloc(sizeof(int) * wait_size);
  683     801      memcpy(node->waits, waits, sizeof(int) * wait_size);
  684     801    }
  685   4.82k  }
  686
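The rule implemented above, restated: an edge needs no synchronization when every stream of the consumer is already among the producer's streams; otherwise each producer stream the consumer does not share gets a signal (allocated once, then reused), and the consumer waits on it, with signal_set deduplicating waits a stream has already observed. The core decision in pseudo-C (deliberately simplified to one stream per node; not ccv code):

    typedef struct { int stream; int signal; } sched1_t;

    /* Returns the signal the consumer must wait on, or -1 if none is needed. */
    static int need_wait(sched1_t* const producer, const sched1_t* const consumer, int* const signal_count)
    {
      if (consumer->stream == producer->stream)
        return -1; /* Same stream: ordering is implicit. */
      if (producer->signal < 0)
        producer->signal = (*signal_count)++; /* Allocate once, reuse for later consumers. */
      return producer->signal;
    }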
  687          typedef struct {
  688            int rank;
  689            ccv_array_t* outgoings;
  690          } ccv_nnc_incoming_t;
  691
  692          static int _ccv_nnc_device_ids_for_stream_data(ccv_nnc_graph_exec_info_t* const node, const int device_id, ccv_array_t* const stream_data, int* const device_ids, const int max_device_id_size)
  693   12.8k  {
  694   12.8k    // TODO: I need to re-think whether this is GPU only or not.
  695   12.8k    int device_id_size = ccv_nnc_device_ids_for_io(node->inputs, node->input_size, node->outputs, node->output_size, CCV_TENSOR_GPU_MEMORY, device_ids, max_device_id_size);
  696   12.8k    if (device_id_size == 0)
  697   2.06k    {
  698   2.06k      // If there is a default data, use that device id. Otherwise, use the device id passed in (this will be the default data device id).
  699   2.06k      if (stream_data->rnum > 0)
  700   1.93k      {
  701   1.93k        ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
  702   1.93k        device_ids[0] = default_data->device_id;
  703   1.93k      } else
  704     137        device_ids[0] = device_id >= 0 ? device_id : 0;
  705   2.06k      device_id_size = 1;
  706   2.06k    }
  707   12.8k    return device_id_size;
  708   12.8k  }
  709
  710          void ccv_nnc_graph_static_schedule_free(ccv_nnc_graph_static_schedule_t* const schedule)
  711     355  {
  712     355    int i;
  713     355    ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
  714   7.52k    for (i = 0; i < schedule->exec_info_size; i++)
  715   7.17k    {
  716   7.17k      if (schd_info[i].stream_size > 1)
  717   7.17k        ccfree(schd_info[i]._heap_streams);
  718   7.17k      if (schd_info[i].waits)
  719   7.17k        ccfree(schd_info[i].waits);
  720   7.17k    }
  721     355    if (schedule->stream_1s)
  722     355      ccfree(schedule->stream_1s);
  723     355    if (schedule->waits)
  724     355      ccfree(schedule->waits);
  725     355    if (schedule->psort)
  726     355      ccfree(schedule->psort);
  727     355    if (schedule->begin)
  728      14      ccv_nnc_stream_signal_free(schedule->begin);
  729     355    if (schedule->end)
  730     355      ccv_nnc_stream_signal_free(schedule->end);
  731     355    ccfree(schedule);
  732     355  }
  733
  734          static ccv_nnc_graph_static_schedule_t* _ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int device_id, ccv_nnc_stream_context_t* const stream_context, const ccv_nnc_graph_exec_t* const _sources, const int _source_size, const ccv_nnc_graph_exec_t* const _destinations, const int _destination_size)
  735     355  {
  736     355    assert(graph->sources && graph->sources->rnum);
  737     355    assert(graph->destinations && graph->destinations->rnum);
  738     355    assert(graph->topsorted); // Only support this on a topsorted graph.
  739     355    const int exec_info_size = graph->exec_info->rnum;
  740     355    assert(exec_info_size > 0);
  741     355    const ccv_nnc_graph_exec_t* const sources = _sources == 0 ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : _sources;
  742     355    const int source_size = _sources == 0 ? graph->sources->rnum : _source_size;
  743     355    if (!_sources)
  744     330      { assert(_source_size == 0); }
  745     355    const ccv_nnc_graph_exec_t* const destinations = _destinations == 0 ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : _destinations;
  746     355    const int destination_size = _destinations == 0 ? graph->destinations->rnum : _destination_size;
  747     355    if (!_destinations)
  748     322      { assert(_destination_size == 0); }
  749     355    const int root_schedule = (_sources == 0 && _destinations == 0);
  750     355    ccv_nnc_graph_static_schedule_t* const schedule = cccalloc(1, sizeof(ccv_nnc_graph_static_schedule_t) + sizeof(ccv_nnc_graph_exec_schedule_t) * (exec_info_size - 1));
  751     355    schedule->exec_info_size = exec_info_size;
  752     355    ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
  753     355    ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0);
  754     710    ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0);
  755     710    if (!root_schedule)
  756      54    {
  757      54      // If this is not a root schedule, we need to do a partial topsort.
  758      54      int psort_size = 0;
  759   1.51k      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
  760   1.51k        ++psort_size;
  761   1.51k      } ccv_nnc_graph_visit_endfor
  762      54      schedule->psort = (int*)ccmalloc(sizeof(int) * psort_size);
  763      54      schedule->psort_size = psort_size;
  764      54      psort_size = 0;
  765   1.51k      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
  766   1.51k        schedule->psort[psort_size++] = idx;
  767   1.51k      } ccv_nnc_graph_visit_endfor
  768      54    }
  769     710    int i, j, k;
  770     710    // Generate exec dependencies (or, in other words, partial ordering of executions).
  771     710    ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(exec_info_size, exec_info_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  772     710    int* buf = (int*)ccmalloc(sizeof(int) * exec_info_size * 2);
  773     710    int buf_size;
  774     710  #define for_block(x, val) \
  775    178k    do { \
  776    178k      if (((int32_t*)val)[0] > 0) \
  777    178k      { \
  778    178k        buf[buf_size * 2] = x; \
  779    178k        buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
  780    178k        ++buf_size; \
  781    178k      } \
  782    178k    } while (0)
  783   7.52k    for (i = 0; i < exec_info_size; i++)
  784   7.17k      schd_info[i].stream_size = -1;
  785   5.19k    ccv_nnc_graph_visit_for(visit, exec_info, node, idx, term) {
  786   5.19k      buf_size = 0; /* save all its parent deps to this buffer */
  787   5.19k      ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
  788   5.19k      schd_info[idx].stream_size = 0;
  789   5.19k      if (vector)
  790    178k        CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
  791   5.19k      if (!node->outgoings)
  792     322        continue;
  793   14.0k      for (i = 0; i < node->outgoings->rnum; i++)
  794   9.15k      {
  795   9.15k        int outgoing = *(int*)ccv_array_get(node->outgoings, i);
  796   9.15k        const int32_t one = 1;
  797   9.15k        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
  798   9.15k        /* If not found, set; if the current node is the destination node, there is no need
  799   9.15k         * to set itself as a parent of subsequent nodes because of its terminal nature. */
  800   9.15k        if (!term && (!cell.i32 || cell.i32[0] == 0))
  801   9.08k          ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
  802    312k        for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
  803    302k        {
  804    302k          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
  805    302k          /* If not found, set */
  806    302k          if (!cell.i32 || cell.i32[0] == 0)
  807    186k            ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
  808    116k          else {
  809    116k            /* Otherwise, set to the longest one */
  810    116k            int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
  811    116k            ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
  812    116k          }
  813    302k        }
  814   9.15k      }
  815   4.87k    } ccv_nnc_graph_visit_endfor
  816     710  #undef for_block
  817     710    ccfree(buf);
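The visitor above fills exec_dep so that a cell (a, b) > 0 means b is a (transitive) predecessor of a, with the stored value tracking the longest known path; later passes use it both to prune redundant outgoing edges and to prove a stream's previous work finishes before a node starts. The recurrence, restated (dep() is shorthand, not a real function):

    // dep(outgoing, idx)      = 1                                  (direct edge, unless idx is terminal)
    // dep(outgoing, ancestor) = max(dep(outgoing, ancestor),
    //                               dep(idx, ancestor) + 1)        (extend every path through idx)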
  818     355    // Algorithm to allocate signals and streams for this graph.
  819     355    ccv_array_t* const stream_data = ccv_array_new(sizeof(ccv_nnc_stream_data_t), 0, 0);
  820     355    ccv_array_t** const outgoings = cccalloc(exec_info_size, sizeof(ccv_array_t*));
  821     355    ccv_nnc_incoming_t* const incomings = cccalloc(exec_info_size, sizeof(ccv_nnc_incoming_t));
  822     355    int max_device_id_size = 1;
  823     355    // Filter out outgoing nodes that we will be able to reach afterwards anyway.
  824   5.19k    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
  825   5.19k      max_device_id_size = ccv_max(node->input_size + node->output_size, max_device_id_size);
  826   5.19k      if (node->outgoings)
  827   4.87k      {
  828   4.87k        outgoings[idx] = ccv_array_new(sizeof(int), 0, 0);
  829   14.0k        for (i = 0; i < node->outgoings->rnum; i++)
  830   9.15k        {
  831   9.15k          const int di = *(int*)ccv_array_get(node->outgoings, i);
  832   9.15k          // Skip if we haven't accessed this exec.
  833   9.15k          if (schd_info[di].stream_size < 0)
  834   1.32k            continue;
  835   7.82k          int flag = 0;
  836   26.1k          for (j = 0; !flag && j < node->outgoings->rnum; j++)
  837   18.3k          {
  838   18.3k            if (j != i)
  839   12.2k            {
  840   12.2k              const int dj = *(int*)ccv_array_get(node->outgoings, j);
  841   12.2k              ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, di, dj);
  842   12.2k              flag = (cell.i32 && cell.i32[0]);
  843   12.2k            }
  844   18.3k          }
  845   7.82k          if (!flag)
  846   5.98k          {
  847   5.98k            ccv_array_push(outgoings[idx], &di);
  848   5.98k            if (!incomings[di].outgoings)
  849   4.82k              incomings[di].outgoings = ccv_array_new(sizeof(int), 1, 0);
  850   5.98k            ccv_array_push(incomings[di].outgoings, &idx);
  851   5.98k          }
  852   7.82k        }
  853   4.87k      }
  854   5.19k    } ccv_nnc_graph_visit_endfor
  855     355  #define visitor(node, idx, _) \
  856   5.19k    if (node->outgoings) \
  857   10.8k      for (i = 0; i < node->outgoings->rnum; i++) \
  858   5.98k      { \
  859   5.98k        const int d = *(int*)ccv_array_get(node->outgoings, i); \
  860   5.98k        node->rank = ccv_max(incomings[d].rank + 1, node->rank); \
  861   5.98k      }
  862   5.19k    CCV_NNC_GRAPH_VISIT(graph, incomings, exec_info_size, destinations, destination_size, sources, source_size, 0, visitor);
  863     355  #undef visitor
  864     355    int device_ids[max_device_id_size];
  865     355    int outgoing_device_ids[max_device_id_size];
  866     355    int signal_size = 0;
  867   5.19k    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
  868   5.19k      // Go through the incomings.
  869   5.19k      const int device_id_size = _ccv_nnc_device_ids_for_stream_data(node, device_id, stream_data, device_ids, max_device_id_size);
  870   5.19k      if (schd_info[idx].stream_size == 0)
  871     375      {
  872     375        schd_info[idx].stream_size = device_id_size; // At least at the same size as the device_id_size.
  873     375        if (device_id_size > 1)
  874       6        {
  875       6          schd_info[idx]._heap_streams = (int*)ccmalloc(sizeof(int) * device_id_size * 2);
  876       6          schd_info[idx]._heap_signals = (schd_info[idx]._heap_streams + device_id_size);
  877       6        }
  878     764        for (i = 0; i < device_id_size; i++)
  879     389          SCHEDULE_STREAMS(schd_info[idx])[i] = -1, SCHEDULE_SIGNALS(schd_info[idx])[i] = -1;
  880     375      }
  881   10.8k      for (i = 0; i < device_id_size; i++)
  882   5.61k        // Go through until the end to assign streams.
  883   5.61k        if (SCHEDULE_STREAMS(schd_info[idx])[i] < 0)
  884   1.30k        {
  885   1.30k          int stream_idx = -1;
  886   1.30k          int stream_has_command = 0;
  887   1.30k          // First, find a good stream in stream data (the stream is good if it can be recycled, and it has the same command).
  888   1.30k          // Otherwise, we prefer a usable stream (it doesn't have the command, but it can be recycled).
  889   35.2k          for (j = 0; (stream_idx < 0 || !stream_has_command) && j < stream_data->rnum; j++)
  890   33.9k          {
  891   33.9k            ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j);
  892   33.9k            if (data->device_id == device_ids[i])
  893   8.93k            {
  894   8.93k              const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, data->exec_idx);
  895   8.93k              // If there is a path to conclude that exec_idx is before idx, then we can reuse
  896   8.93k              // this stream. Otherwise the work in this "empty stream" could still be ongoing,
  897   8.93k              // and we may delay the following work unnecessarily.
  898   8.93k              if (cell.i32 && cell.i32[0] > 0)
  899     146              {
  900     146                if (ccv_array_find_uint(data->command_set, node->cmd.cmd))
  901      72                  stream_idx = j, stream_has_command = 1;
  902      74                else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet.
  903      29                  stream_idx = j;
  904     146              }
  905   8.93k            }
  906   33.9k          }
  907   1.30k          if (stream_idx < 0)
  908   1.20k          {
  909   1.20k            stream_idx = stream_data->rnum;
  910   1.20k            const ccv_nnc_stream_data_t data = {
  911   1.20k              .device_id = device_ids[i],
  912   1.20k            };
  913   1.20k            ccv_array_push(stream_data, &data);
  914   1.20k          }
  915   1.30k          assert(stream_idx >= 0);
  916   1.30k          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
  917   1.30k          if (!data->command_set)
  918   1.20k            data->command_set = ccv_array_new(sizeof(uint32_t), 1, 0);
  919   1.30k          SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idx;
  920   1.30k          ccv_array_add_unique_uint(data->command_set, node->cmd.cmd);
  921   1.30k          // Assign all subsequent nodes to use this stream.
  922   1.30k          int outgoing_idx = idx;
  923   5.61k          while (outgoings[outgoing_idx] && outgoings[outgoing_idx]->rnum)
  924   5.23k          {
  925   5.23k            int highest_rank = -1;
  926   5.23k            int highest_idx = -1;
  927   5.23k            int stream_n = -1;
  928   5.23k            int stream_has_command = 0;
  929   12.8k            for (j = 0; j < outgoings[outgoing_idx]->rnum; j++)
  930   7.62k            {
  931   7.62k              const int d = *(int*)ccv_array_get(outgoings[outgoing_idx], j);
  932   7.62k              // This is not outside of our scope at this point.
  933   7.62k              assert(schd_info[d].stream_size >= 0);
  934   7.62k              ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + d;
  935   7.62k              const int outgoing_device_id_size = _ccv_nnc_device_ids_for_stream_data(outgoing_node, device_id, stream_data, outgoing_device_ids, max_device_id_size);
  936   7.62k              if (schd_info[d].stream_size == 0)
  937   4.82k              {
  938   4.82k                schd_info[d].stream_size = outgoing_device_id_size; // At least at the same size as the device_id_size.
  939   4.82k                if (outgoing_device_id_size > 1)
  940     144                {
  941     144                  schd_info[d]._heap_streams = (int*)ccmalloc(sizeof(int) * outgoing_device_id_size * 2);
  942     144                  schd_info[d]._heap_signals = (schd_info[d]._heap_streams + outgoing_device_id_size);
  943     144                }
  944   10.0k                for (k = 0; k < outgoing_device_id_size; k++)
  945   5.22k                  SCHEDULE_STREAMS(schd_info[d])[k] = -1, SCHEDULE_SIGNALS(schd_info[d])[k] = -1;
  946   4.82k              }
  947   7.62k              assert(schd_info[d].stream_size == outgoing_device_id_size);
  948   16.2k              for (k = 0; k < outgoing_device_id_size; k++)
  949   8.66k                // If it should be on the same device and the stream is not assigned, potentially.
  950   8.66k                if (outgoing_device_ids[k] == device_ids[i] &&
  951   8.66k                  SCHEDULE_STREAMS(schd_info[d])[k] < 0 &&
  952   8.66k                  (incomings[d].rank > highest_rank ||
  953   4.92k                   (incomings[d].rank == highest_rank &&
  954     621                    !stream_has_command && ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd))))
  955   4.30k                {
  956   4.30k                  highest_rank = incomings[d].rank;
  957   4.30k                  highest_idx = d;
  958   4.30k                  stream_n = k;
  959   4.30k                  // This is 1 if the rank is the same (thus, I must break the tie already); if the rank is not the same, we need to compute this.
  960   4.30k                  stream_has_command = (incomings[d].rank == highest_rank || ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd));
  961   4.30k                }
  962   7.62k            }
  963   5.23k            if (highest_idx >= 0)
  964   4.30k            {
  965   4.30k              outgoing_idx = highest_idx;
  966   4.30k              ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx;
  967   4.30k              assert(stream_n >= 0);
  968   4.30k              SCHEDULE_STREAMS(schd_info[outgoing_idx])[stream_n] = stream_idx;
  969   4.30k              ccv_array_add_unique_uint(data->command_set, outgoing_node->cmd.cmd);
  970   4.30k            } else
  971     924              break;
  972   5.23k          }
  973   1.30k          data->exec_idx = outgoing_idx;
  974   1.30k        }
  975   5.19k    } ccv_nnc_graph_visit_endfor
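Two heuristics in the pass above are worth restating: a stream is recycled only when exec_dep proves its last node (data->exec_idx) happens before the current one, and once a stream is picked it is greedily chained down the highest-rank unassigned successor on the same device, preferring successors whose command the stream has already executed. In outline (comments only, not runnable):

    // pick_stream(node, device):
    //   among existing streams on device whose exec_idx happens-before node:
    //     prefer one whose command_set already contains node->cmd.cmd;
    //   otherwise create a new stream.
    // chain(stream, node):
    //   repeatedly hand the stream to the unassigned outgoing node with the
    //   highest rank (deepest remaining chain), so long pipelines share a stream.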
976
355
  // Go through to assign signals when necessary.
977
5.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
978
5.19k
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum4.82k
)
979
4.82k
      _ccv_nnc_graph_schedule_assign_signals(incomings[idx].outgoings, schd_info + idx, stream_data, &signal_size, schd_info, exec_info_size);
980
5.19k
  } ccv_nnc_graph_visit_endfor
981
7.52k
  for (i = 0; i < exec_info_size; 
i++7.17k
)
982
7.17k
    if (outgoings[i])
983
4.87k
      ccv_array_free(outgoings[i]);
984
355
  ccfree(outgoings);
985
355
  ccv_matrix_free(exec_dep);
986
355
  ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
987
355
  if (device_id >= 0)
988
4
  {
989
4
    // If the default stream (stream 0) is not the same as desired stream, swap with the one that is.
990
4
    if (default_data->device_id != device_id)
991
0
    {
992
0
      int exchange_stream_idx = -1;
993
0
      // Find the stream idx to exchange.
994
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
995
0
        int flag = 0;
996
0
        for(i = 0; !flag && i < schd_info[idx].stream_size; i++)
997
0
        {
998
0
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[i];
999
0
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
1000
0
          if (data->device_id == device_id)
1001
0
          {
1002
0
            exchange_stream_idx = stream_idx;
1003
0
            flag = 1;
1004
0
          }
1005
0
        }
1006
0
        if (flag)
1007
0
          break;
1008
0
      } ccv_nnc_graph_visit_endfor
1009
0
      assert(exchange_stream_idx >= 0);
1010
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1011
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1012
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == 0)
1013
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = -1;
1014
0
      } ccv_nnc_graph_visit_endfor
1015
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1016
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1017
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == exchange_stream_idx)
1018
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = 0;
1019
0
      } ccv_nnc_graph_visit_endfor
1020
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1021
0
        for (i = 0; i < schd_info[idx].stream_size; i++)
1022
0
          if (SCHEDULE_STREAMS(schd_info[idx])[i] == -1)
1023
0
            SCHEDULE_STREAMS(schd_info[idx])[i] = exchange_stream_idx;
1024
0
      } ccv_nnc_graph_visit_endfor
1025
0
      ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, exchange_stream_idx))->device_id = default_data->device_id;
1026
0
      default_data->device_id = device_id;
1027
0
    }
1028
4
  }
1029
355
  int graph_stream_1_size = 0;
1030
730
  for (i = 0; i < source_size; 
i++375
)
1031
375
  {
1032
375
    const int idx = sources[i].d;
1033
375
    // If it has incoming nodes, check whether these are on stream 0.
1034
375
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum0
)
1035
0
    {
1036
0
      int flag  = 0;
1037
0
      const ccv_array_t* const incoming = incomings[idx].outgoings;
1038
0
      for (j = 0; !flag && j < incoming->rnum; j++)
1039
0
      {
1040
0
        const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1041
0
        for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1042
0
          flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1043
0
      }
1044
0
      if (flag)
1045
0
        continue;
1046
375
    }
1047
764
    
for (j = 0; 375
j < schd_info[idx].stream_size;
j++389
)
1048
389
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need explicit begin signal to start.
1049
34
        ++graph_stream_1_size;
1050
375
  }
1051
355
  if (graph_stream_1_size > 0)
1052
14
  {
1053
14
    schedule->stream_1s = ccmalloc(sizeof(int) * graph_stream_1_size);
1054
14
    graph_stream_1_size = 0;
1055
48
    for (i = 0; i < source_size; 
i++34
)
1056
34
    {
1057
34
      const int idx = sources[i].d;
1058
34
      // If it has incoming nodes, check whether these are on stream 0.
1059
34
      if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum0
)
1060
0
      {
1061
0
        int flag  = 0;
1062
0
        const ccv_array_t* const incoming = incomings[idx].outgoings;
1063
0
        for (j = 0; !flag && j < incoming->rnum; j++)
1064
0
        {
1065
0
          const int incoming_idx = *(int*)ccv_array_get(incoming, j);
1066
0
          for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++)
1067
0
            flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start.
1068
0
        }
1069
0
        if (flag)
1070
0
          continue;
1071
34
      }
1072
82
      
      for (j = 0; j < schd_info[idx].stream_size; j++)
1073
48
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need explicit begin signal to start.
1074
34
        {
1075
34
          const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[j];
1076
34
          int flag = 0;
1077
64
          for (k = 0; !flag && k < graph_stream_1_size; k++)
1078
30
            flag = (stream_idx == schedule->stream_1s[k]);
1079
34
          if (!flag)
1080
34
            schedule->stream_1s[graph_stream_1_size++] = stream_idx;
1081
34
        }
1082
34
    }
1083
14
    schedule->stream_1_size = graph_stream_1_size;
1084
14
  }
1085
7.52k
  for (i = 0; i < exec_info_size; i++)
1086
7.17k
    if (incomings[i].outgoings)
1087
4.82k
      ccv_array_free(incomings[i].outgoings);
1088
355
  ccfree(incomings);
1089
355
  int graph_wait_size = 0;
1090
736
  for (i = 0; i < destination_size; i++)
1091
381
  {
1092
381
    const int idx = destinations[i].d;
1093
762
    for (j = 0; j < schd_info[idx].stream_size; j++)
1094
381
      if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
1095
26
        ++graph_wait_size;
1096
381
  }
1097
355
  if (graph_wait_size > 0)
1098
10
  {
1099
10
    schedule->waits = ccmalloc(sizeof(int) * graph_wait_size);
1100
10
    graph_wait_size = 0;
1101
46
    for (i = 0; i < destination_size; i++)
1102
36
    {
1103
36
      const int idx = destinations[i].d;
1104
72
      for (j = 0; j < schd_info[idx].stream_size; j++)
1105
36
        if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
1106
26
        {
1107
26
          ccv_nnc_stream_data_t* const default_stream_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
1108
26
          if (SCHEDULE_SIGNALS(schd_info[idx])[j] < 0)
1109
26
            SCHEDULE_SIGNALS(schd_info[idx])[j] = signal_size++;
1110
0
          else if (default_stream_data->signal_set && ccv_array_find_int(default_stream_data->signal_set, SCHEDULE_SIGNALS(schd_info[idx])[j]))
1111
0
            continue;
1112
26
          schedule->waits[graph_wait_size++] = SCHEDULE_SIGNALS(schd_info[idx])[j];
1113
26
        }
1114
36
    }
1115
10
    schedule->wait_size = graph_wait_size;
1116
10
  }
1117
1.55k
  for (i = 0; i < stream_data->rnum; i++)
1118
1.20k
  {
1119
1.20k
    ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1120
1.20k
    if (data->signal_set)
1121
896
      ccv_array_free(data->signal_set);
1122
1.20k
    assert(data->command_set);
1123
1.20k
    ccv_array_free(data->command_set);
1124
1.20k
  }
1125
355
  // Allocate streams & signals
1126
355
  int default_stream_type = stream_type;
1127
355
  CCV_STREAM_SET_DEVICE_ID(default_stream_type, default_data->device_id);
1128
355
  if (root_schedule)
1129
301
  {
1130
301
    assert(!graph->streams);
1131
301
    graph->stream_size = stream_data->rnum;
1132
301
    graph->streams = (ccv_nnc_stream_context_t**)ccmalloc(sizeof(ccv_nnc_stream_context_t*) * graph->stream_size);
1133
301
    graph->block_stream_tasks = (co_routine_t**)cccalloc(graph->stream_size, sizeof(co_routine_t*));
1134
301
    if (stream_context)
1135
4
      graph->streams[0] = stream_context;
1136
1.36k
    for (i = (stream_context ? 1 : 0); i < stream_data->rnum; i++)
1137
1.06k
    {
1138
1.06k
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1139
1.06k
      int type = stream_type;
1140
1.06k
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1141
1.06k
      graph->streams[i] = ccv_nnc_stream_context_new(type);
1142
1.06k
    }
1143
301
    graph->signal_size = signal_size;
1144
301
    graph->signals = (ccv_nnc_stream_signal_t**)cccalloc(signal_size, sizeof(ccv_nnc_stream_signal_t*));
1145
3.68k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1146
7.78k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1147
4.10k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1148
1.14k
        {
1149
1.14k
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1150
1.14k
          if (!graph->signals[signal])
1151
1.14k
          {
1152
1.14k
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1153
1.14k
            int type = stream_type;
1154
1.14k
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1155
1.14k
            graph->signals[signal] = ccv_nnc_stream_signal_new(type);
1156
1.14k
          }
1157
1.14k
        }
1158
3.68k
    } ccv_nnc_graph_visit_endfor
1159
301
  } else {
1160
54
    assert(graph->streams);
1161
54
    assert(graph->stream_size >= stream_data->rnum);
1162
54
    // Map each stream in stream_data to a properly allocated stream of the type we need.
1163
54
    int* const stream_idxs = (int*)ccmalloc(sizeof(int) * (stream_data->rnum + signal_size));
1164
54
    uint64_t* const stream_used = (uint64_t*)cccalloc(((graph->stream_size + 63) >> 6) + ((graph->signal_size + 63) >> 6), sizeof(uint64_t));
1165
188
    for (i = 0; i < stream_data->rnum; 
i++134
)
1166
134
    {
1167
134
      ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
1168
134
      int type = stream_type;
1169
134
      CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1170
489
      for (j = 0; j < graph->stream_size; j++)
1171
489
        if (!(stream_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1172
171
        {
1173
171
          const int stream_type = ccv_nnc_stream_context_type(graph->streams[j]);
1174
171
          if (stream_type == type)
1175
134
          {
1176
134
            stream_idxs[i] = j;
1177
134
            stream_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1178
134
            break;
1179
134
          }
1180
171
        }
1181
134
    }
1182
54
    assert(graph->signal_size >= signal_size);
1183
54
    // Map each signal to a properly allocated signal of the type we need.
1184
54
    int* const signal_idxs = stream_idxs + stream_data->rnum;
1185
54
    uint64_t* const signal_used = stream_used + ((graph->stream_size + 63) >> 6);
1186
228
    for (i = 0; i < signal_size; i++)
1187
174
      signal_idxs[i] = -1;
1188
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1189
3.02k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1190
1.51k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1191
174
        {
1192
174
          const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i];
1193
174
          if (signal_idxs[signal] < 0)
1194
174
          {
1195
174
            const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]);
1196
174
            int type = stream_type;
1197
174
            CCV_STREAM_SET_DEVICE_ID(type, data->device_id);
1198
2.24k
            for (j = 0; j < graph->signal_size; j++)
1199
2.24k
              if (!(signal_used[j >> 6] & ((uint64_t)1 << (j & 63))))
1200
308
              {
1201
308
                const int signal_type = ccv_nnc_stream_signal_type(graph->signals[j]);
1202
308
                if (signal_type == type)
1203
174
                {
1204
174
                  signal_idxs[signal] = j;
1205
174
                  signal_used[j >> 6] |= ((uint64_t)1 << (j & 63));
1206
174
                  break;
1207
174
                }
1208
308
              }
1209
174
          }
1210
174
        }
1211
1.51k
    } ccv_nnc_graph_visit_endfor
1212
54
    // Now rebind streams and signals from the schedule.
1213
1.51k
    ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
1214
3.02k
      for (i = 0; i < schd_info[idx].stream_size; i++)
1215
1.51k
      {
1216
1.51k
        SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idxs[SCHEDULE_STREAMS(schd_info[idx])[i]];
1217
1.51k
        if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0)
1218
1.51k
          
          SCHEDULE_SIGNALS(schd_info[idx])[i] = signal_idxs[SCHEDULE_SIGNALS(schd_info[idx])[i]];
1219
1.51k
      }
1220
1.69k
      for (i = 0; i < schd_info[idx].wait_size; i++)
1221
180
        schd_info[idx].waits[i] = signal_idxs[schd_info[idx].waits[i]];
1222
1.51k
    } ccv_nnc_graph_visit_endfor
1223
74
    for (i = 0; i < schedule->stream_1_size; i++)
1224
20
      schedule->stream_1s[i] = stream_idxs[schedule->stream_1s[i]];
1225
80
    for (i = 0; i < schedule->wait_size; i++)
1226
26
      schedule->waits[i] = signal_idxs[schedule->waits[i]];
1227
54
    // Rebind which stream is stream 0 (the default stream).
1228
54
    schedule->stream_0 = stream_idxs[0];
1229
54
    ccfree(stream_used);
1230
54
    ccfree(stream_idxs);
1231
54
  }
1232
355
  assert(graph->streams);
1233
355
  ccv_nnc_graph_visit_free(visit);
1234
1.66k
  for (i = 0; i < signal_size; i++)
1235
1.31k
    { assert(graph->signals[i]); }
1236
355
  if (schedule->stream_1_size)
1237
14
    schedule->begin = ccv_nnc_stream_signal_new(default_stream_type);
1238
355
  schedule->end = ccv_nnc_stream_signal_new(default_stream_type);
1239
355
  // Do this recursively for its sub graphs.
1240
355
  if (graph->sub_graphs)
1241
7
    
    for (i = 0; i < graph->sub_graphs->rnum; i++)
1242
4
    {
1243
4
      ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i);
1244
4
      if (sub_graph && !sub_graph->default_schedule)
1245
4
      {
1246
4
        const int exec_idx = sub_graph->exec_idx - 1;
1247
4
        assert(schd_info[exec_idx].stream_size == 1);
1248
4
        const int stream_idx = SCHEDULE_STREAMS(schd_info[exec_idx])[0];
1249
4
        const int device_id = ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx))->device_id;
1250
4
        sub_graph->default_schedule = _ccv_nnc_graph_static_schedule_new(sub_graph, stream_type, device_id, graph->streams[stream_idx], 0, 0, 0, 0);
1251
4
      }
1252
4
    }
1253
355
  ccv_array_free(stream_data);
1254
355
  return schedule;
1255
355
}
1256
void ccv_nnc_graph_set_default_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type)
1257
297
{
1258
297
  assert(graph->p == 0);
1259
297
  if (graph->default_schedule)
1260
0
    ccv_nnc_graph_static_schedule_free(graph->default_schedule);
1261
297
  graph->default_schedule = _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, 0, 0, 0, 0, 0);
1262
297
}
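A minimal usage sketch (hypothetical caller code, not part of this translation unit): after a graph is fully built, one call gives it the default schedule computed by the code above. CCV_STREAM_CONTEXT_GPU is assumed to be the stream-type constant declared in ccv_nnc.h.
// Hypothetical call site; `graph` is a finished ccv_nnc_graph_t with its
// sources and destinations already set.
ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU);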
1263
1264
ccv_nnc_graph_static_schedule_t* ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
1265
54
{
1266
54
  assert(graph->p == 0);
1267
54
  return _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, 0, sources, source_size, destinations, destination_size);
1268
54
}
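The public wrapper above also allows scheduling just a sub-computation. A sketch under the same assumptions (`source` and `destination` are hypothetical exec handles; the run entry point is intentionally left to ccv_nnc.h):
// Hypothetical partial schedule covering only source -> destination.
ccv_nnc_graph_static_schedule_t* const schedule = ccv_nnc_graph_static_schedule_new(graph, CCV_STREAM_CONTEXT_GPU, &source, 1, &destination, 1);
/* ... run the graph with this schedule via the run API declared in ccv_nnc.h ... */
ccv_nnc_graph_static_schedule_free(schedule);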
1269
1270
ccv_nnc_stream_context_t* ccv_nnc_graph_default_stream(const ccv_nnc_graph_t* const graph)
1271
9
{
1272
9
  if (graph->streams && graph->stream_size > 0)
1273
9
    return graph->streams[0];
1274
0
  return 0;
1275
0
}
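One plausible synchronization pattern built on the accessor above (a sketch; ccv_nnc_stream_context_wait is assumed from the stream API in ccv_nnc.h):
// Hypothetical: after an asynchronous run, drain the default stream before
// touching outputs on the host.
ccv_nnc_stream_context_t* const stream = ccv_nnc_graph_default_stream(graph);
if (stream)
  ccv_nnc_stream_context_wait(stream);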
1276
1277
static void _ccv_nnc_graph_dot_exec(const int index, const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_stream_context_t** const streams, const int flags, FILE* out)
1278
660
{
1279
660
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1280
658
    fputc('{', out);
1281
660
  fprintf(out, "node%d", index);
1282
660
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1283
658
  {
1284
658
    fputs("|Command: ", out);
1285
658
    fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1286
658
    if (schd_info)
1287
142
    {
1288
142
      if (schd_info->stream_size > 0)
1289
142
      {
1290
142
        int i, flag = 0;
1291
142
        fputs("|Stream: ", out);
1292
296
        for (i = 0; i < schd_info->stream_size; i++)
1293
154
        {
1294
154
          const int device_id = streams ? CCV_TENSOR_GET_DEVICE_ID(streams[SCHEDULE_STREAMS(*schd_info)[i]]->type) : 0;
1295
154
          if (i == 0)
1296
142
            fprintf(out, "%d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
1297
12
          else
1298
12
            fprintf(out, ", %d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id);
1299
154
        }
1300
296
        for (i = 0; i < schd_info->stream_size; i++)
1301
154
          if (SCHEDULE_SIGNALS(*schd_info)[i] >= 0)
1302
69
          {
1303
69
            if (!flag)
1304
60
            {
1305
60
              flag = 1;
1306
60
              fprintf(out, "|Signal: %d", SCHEDULE_SIGNALS(*schd_info)[i]);
1307
60
            } else
1308
9
              fprintf(out, ", %d", SCHEDULE_SIGNALS(*schd_info)[i]);
1309
69
          }
1310
142
      }
1311
142
      if (schd_info->wait_size > 0)
1312
76
      {
1313
76
        fputs("|Wait: ", out);
1314
76
        int i;
1315
116
        for (i = 0; i < schd_info->wait_size - 1; i++)
1316
40
          fprintf(out, "%d, ", schd_info->waits[i]);
1317
76
        fprintf(out, "%d", schd_info->waits[schd_info->wait_size - 1]);
1318
76
      }
1319
142
    }
1320
658
    fputc('}', out);
1321
658
  }
1322
660
}
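For orientation, the fprintf sequence above emits one Graphviz record label per node, shaped roughly like the following (an illustrative reconstruction; the command name and all numbers are made up):
// {node5|Command: CCV_NNC_CONVOLUTION_FORWARD|Stream: 1 (d0), 2 (d1)|Signal: 3|Wait: 0, 1}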
1323
1324
static void _ccv_nnc_graph_dot_tensor(const int index, const ccv_nnc_tensor_t* const tensor, const int zone, const int flags, const int depth, FILE* out)
1325
1.82k
{
1326
1.82k
  // If it has an alias pointer, or it is the long form.
1327
1.82k
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1328
1.82k
    fputc('{', out);
1329
1.82k
  const int is_tensor_view = CCV_IS_TENSOR_VIEW(tensor);
1330
1.82k
  if (is_tensor_view)
1331
81
    fprintf(out, "tensorview%d", index);
1332
1.74k
  else
1333
1.74k
    fprintf(out, "tensor%d", index);
1334
1.82k
  int i;
1335
2.02k
  for (i = 0; i < depth; i++) // Print subscript marks to denote depth.
1336
195
    fputc('\'', out);
1337
1.82k
  if (CCV_GET_TAPE_ALLOC(tensor->type))
1338
1.82k
    
fputs(" (t)", out)9
;
1339
1.82k
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1340
1.82k
  {
1341
1.82k
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(tensor->info.type);
1342
1.82k
    fprintf(out, "|d%d|zone%d", device_id, zone);
1343
2.01k
    for (i = 0; i < depth; i++) // Print subscript marks to denote depth.
1344
195
      fputc('\'', out);
1345
1.82k
    uintptr_t aptr = (uintptr_t)tensor->data.u8;
1346
1.82k
    const int* ainc = is_tensor_view ? ((ccv_nnc_tensor_view_t*)(tensor))->inc : tensor->info.dim;
1347
1.82k
    // For the last one, we don't extend to full ainc.
1348
1.82k
    size_t ainc_size = (ccv_nnc_dimension_count(ainc) - ainc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1349
1.82k
    // Print out the range as well.
1350
1.82k
    fprintf(out, "|{%#010x|%#010x}|%d", (uint32_t)aptr, (uint32_t)(aptr + ainc_size - 1), tensor->info.dim[0]);
1351
3.77k
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
1352
1.95k
      fprintf(out, "x%d", tensor->info.dim[i]);
1353
1.82k
    fputc('}', out);
1354
1.82k
  }
1355
1.82k
}
1356
1357
typedef struct {
1358
  int index;
1359
  int name;
1360
  int zone;
1361
  uintptr_t tensor_ref;
1362
  uintptr_t start_ptr;
1363
  uintptr_t end_ptr;
1364
} ccv_nnc_tensor_dot_t;
1365
1366
typedef struct {
1367
  ccv_nnc_tensor_dot_t* dots;
1368
  int* remap;
1369
  int* rename_zone;
1370
  int* rename_index;
1371
} ccv_nnc_tensor_dot_recovery_t;
1372
1373
// First sort by start_ptr, then sort by tensor ptr (so that we will have the same tensor sorted to one cluster).
1374
9.86k
#define less_than(i1, i2, aux) ((i1).start_ptr < (i2).start_ptr || ((i1).start_ptr == (i2).start_ptr && (i1).tensor_ref < (i2).tensor_ref))
1375
9.86k
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_dot_sort_by_ptr, ccv_nnc_tensor_dot_t, less_than)
1376
#undef less_than
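Since the macro feeds CCV_IMPLEMENT_QSORT, the ordering is easiest to read as a plain comparator. An equivalent standalone predicate (illustrative only; the file itself only uses the macro):
// True when i1 orders before i2: primarily by start_ptr, then by tensor_ref,
// so dots sharing a start pointer cluster together and identical tensors
// become adjacent after sorting.
static int _tensor_dot_less_than(const ccv_nnc_tensor_dot_t i1, const ccv_nnc_tensor_dot_t i2)
{
  return i1.start_ptr < i2.start_ptr || (i1.start_ptr == i2.start_ptr && i1.tensor_ref < i2.tensor_ref);
}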
1377
1378
static int _ccv_nnc_graph_dot_tensor_multiview_count(const ccv_nnc_tensor_multiview_t* const mv)
1379
260
{
1380
260
  if (!CCV_IS_TENSOR_MULTIVIEW(mv))
1381
260
    
    return 1;
1382
86
  const int count = mv->kind + mv->repeat;
1383
86
  int i, c = 0;
1384
269
  for (i = 0; i < count; i++)
1385
183
    c += _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
1386
86
  return c;
1387
86
}
1388
1389
static void _ccv_nnc_graph_dot_tensor_multiview_tensor_dots(const ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_dot_t* const tensor_dots, int* tensor_index)
1390
86
{
1391
86
  const int count = mv->kind + mv->repeat;
1392
86
  int i;
1393
269
  for (i = 0; i < count; i++)
1394
183
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1395
183
      
_ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)9
CCV_NNC_MULTIVIEW_DATA9
(mv)[i], tensor_dots, tensor_index);
1396
174
    else {
1397
174
      tensor_dots[*tensor_index].name = *tensor_index;
1398
174
      tensor_dots[*tensor_index].start_ptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
1399
174
      // Because tv's pointer will get updated, it is not correct in this case to have one tensor_ref.
1400
174
      tensor_dots[*tensor_index].tensor_ref = tensor_dots[*tensor_index].start_ptr;
1401
174
      const size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
1402
174
      tensor_dots[*tensor_index].end_ptr = tensor_dots[*tensor_index].start_ptr + dim_size - 1;
1403
174
      ++(*tensor_index);
1404
174
    }
1405
86
}
1406
1407
static ccv_nnc_tensor_dot_recovery_t _ccv_nnc_graph_tensor_dot_recovery(const ccv_nnc_graph_t* const graph)
1408
169
{
1409
169
  int i, j;
1410
169
  // Recover tensor relationships for all tensors referenced in the graph.
1411
169
  // Most notably, we have to give these indexes, and find if they point to
1412
169
  // the same memory region, and whether they overlap. This information
1413
169
  // is lost since we converted from the symbolic form to the execution form.
1414
169
  // Here we do our best to recover it, because that is easier to understand
1415
169
  // if we want to present the graph visually (also, we don't want to put this
1416
169
  // information into the tensor or execution graph to avoid overhead, thus,
1417
169
  // recovering is the best we can do).
1418
169
  int tensor_count = 0;
1419
866
  for (i = 0; i < graph->exec_info->rnum; i++)
1420
697
  {
1421
697
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1422
2.04k
    for (j = 0; j < exec_info->input_size; j++)
1423
1.34k
      if (exec_info->inputs[j])
1424
1.12k
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->inputs[j]) : 1;
1425
1.49k
    for (j = 0; j < exec_info->output_size; j++)
1426
799
      if (exec_info->outputs[j])
1427
774
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->outputs[j]) : 1;
1428
697
  }
1429
169
  ccv_nnc_tensor_dot_t* tensor_dots = tensor_count > 0 ? (ccv_nnc_tensor_dot_t*)ccmalloc(sizeof(ccv_nnc_tensor_dot_t) * tensor_count) : 0;
1430
169
  int k = 0;
1431
866
  for (i = 0; i < graph->exec_info->rnum; i++)
1432
697
  {
1433
697
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1434
2.04k
    for (j = 0; j < exec_info->input_size; j++)
1435
1.34k
    {
1436
1.34k
      ccv_nnc_tensor_t* tensor = exec_info->inputs[j];
1437
1.34k
      if (!tensor)
1438
218
        continue;
1439
1.12k
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
1440
1.12k
        
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
1441
1.09k
      else {
1442
1.09k
        tensor_dots[k].name = k;
1443
1.09k
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
1444
1.09k
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
1445
1.09k
        const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
1446
1.09k
        const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1447
1.09k
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
1448
1.09k
        ++k;
1449
1.09k
      }
1450
1.12k
    }
1451
1.49k
    for (j = 0; j < exec_info->output_size; j++)
1452
799
    {
1453
799
      ccv_nnc_tensor_t* tensor = exec_info->outputs[j];
1454
799
      if (!tensor)
1455
25
        continue;
1456
774
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
1457
774
        
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
1458
733
      else {
1459
733
        tensor_dots[k].name = k;
1460
733
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
1461
733
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
1462
733
        const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
1463
733
        const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
1464
733
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
1465
733
        ++k;
1466
733
      }
1467
774
    }
1468
697
  }
1469
169
  tensor_count = k; // We may over count, now shrink.
1470
169
  // To group overlap memory into one zone, we sort it by start ptr first (secondary by the tensor pointer).
1471
169
  _ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0);
1472
169
  int index = 0, zone = 0;
1473
169
  uintptr_t tensor_ref = tensor_count > 0 ? tensor_dots[0].tensor_ref : 0;
1474
169
  uintptr_t end_ptr = tensor_count > 0 ? tensor_dots[0].end_ptr : 0;
1475
169
  // Then, it is trivial, we go by end ptr. If the next start ptr is still within the end ptr (start ptr <= end ptr),
1476
169
  // they are the same zone.
1477
2.16k
  for (i = 0; i < tensor_count; i++)
1478
1.99k
  {
1479
1.99k
    if (tensor_dots[i].tensor_ref != tensor_ref)
1480
891
    {
1481
891
      tensor_ref = tensor_dots[i].tensor_ref;
1482
891
      ++index;
1483
891
    }
1484
1.99k
    if (tensor_dots[i].start_ptr > end_ptr)
1485
664
    {
1486
664
      end_ptr = ccv_max(end_ptr, tensor_dots[i].end_ptr);
1487
664
      ++zone;
1488
664
    }
1489
1.99k
    tensor_dots[i].index = index;
1490
1.99k
    tensor_dots[i].zone = zone;
1491
1.99k
  }
1492
169
  // We already have index and zone assigned, but the problem is that these are not very human interpretable (because
1493
169
  // it follows the pointer from low to high, not the tensor creation order). The following code renamed both the index
1494
169
  // and the zone so that it is much more understandable.
1495
169
  const int index_count = index + 1;
1496
169
  const int zone_count = zone + 1;
1497
169
  int* remap = (int*)ccmalloc(sizeof(int) * (tensor_count + index_count + zone_count));
1498
169
  int* rename_index = remap + tensor_count;
1499
169
  int* rename_zone = rename_index + index_count;
1500
2.16k
  for (i = 0; i < tensor_count; i++)
1501
1.99k
    remap[tensor_dots[i].name] = i;
1502
1.22k
  for (i = 0; i < index_count; i++)
1503
1.06k
    rename_index[i] = -1;
1504
1.00k
  for (i = 0; i < zone_count; i++)
1505
833
    rename_zone[i] = -1;
1506
169
  index = 0;
1507
169
  zone = 0;
1508
2.16k
  for (i = 0; i < tensor_count; i++)
1509
1.99k
  {
1510
1.99k
    ccv_nnc_tensor_dot_t* tensor_dot = tensor_dots + remap[i];
1511
1.99k
    if (rename_index[tensor_dot->index] == -1)
1512
1.05k
      rename_index[tensor_dot->index] = index++;
1513
1.99k
    if (rename_zone[tensor_dot->zone] == -1)
1514
829
      rename_zone[tensor_dot->zone] = zone++;
1515
1.99k
  }
1516
169
  ccv_nnc_tensor_dot_recovery_t recovery = {
1517
169
    .dots = tensor_dots,
1518
169
    .remap = remap,
1519
169
    .rename_index = rename_index,
1520
169
    .rename_zone = rename_zone,
1521
169
  };
1522
169
  return recovery;
1523
169
}
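A worked trace of the index/zone pass above, with made-up pointers:
// Hypothetical dots after sorting by start_ptr:
//   A: [0x1000, 0x1fff] -> index 0, zone 0
//   B: [0x1800, 0x2fff] -> index 1 (new tensor_ref), zone 0 (start <= running end_ptr)
//   C: [0x4000, 0x4fff] -> index 2, zone 1 (start past the running end_ptr)
// The renaming loops then renumber indexes and zones in first-appearance
// order, so the dot output labels tensors the way a reader encounters them.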
1524
1525
static void _ccv_nnc_graph_tensor_dot_recovery_free(const ccv_nnc_tensor_dot_recovery_t recovery)
1526
169
{
1527
169
  ccfree(recovery.dots);
1528
169
  ccfree(recovery.remap);
1529
169
}
1530
1531
static void _ccv_nnc_graph_dot_tensor_multiview_one(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int depth, int* tensor_index, FILE* out)
1532
86
{
1533
86
  const int count = mv->kind + mv->repeat;
1534
86
  int i, j;
1535
86
  fputs("|{", out);
1536
269
  for (i = 0; i < count; i++)
1537
183
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
1538
183
    {
1539
9
      fprintf(out, "{%d", i);
1540
9
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
1541
9
        fputc('*', out); // Denotes that we loop on this.
1542
9
      _ccv_nnc_graph_dot_tensor_multiview_one((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], recovery, depth, tensor_index, out);
1543
9
      if (i == count - 1)
1544
7
        fputc('}', out);
1545
2
      else
1546
2
        fputs("}|", out);
1547
174
    } else {
1548
174
      fprintf(out, "{%d", i);
1549
174
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
1550
163
        fputc('*', out); // Denotes that we loop on this.
1551
174
      const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
1552
174
      fprintf(out, "|zone%d", recovery.rename_zone[tensor_dot->zone]);
1553
368
      for (j = 0; j < depth; j++)
1554
194
        fputc('\'', out);
1555
174
      uintptr_t aptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
1556
174
      // For the last one, we don't extend to full ainc.
1557
174
      size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
1558
174
      // Print out the range as well.
1559
174
      fprintf(out, "|{%#010x|%#010x}", (uint32_t)aptr, (uint32_t)(aptr + dim_size - 1));
1560
174
      ++(*tensor_index);
1561
174
      if (i == count - 1)
1562
79
        fputc('}', out);
1563
95
      else
1564
95
        fputs("}|", out);
1565
174
    }
1566
86
  fputc('}', out);
1567
86
}
1568
1569
static void _ccv_nnc_graph_dot_tensor_multiview(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, int* tensor_index, FILE* out)
1570
77
{
1571
77
  // If it has an alias pointer, or it is the long form.
1572
77
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1573
77
    fputc('{', out);
1574
77
  const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
1575
77
  fprintf(out, "multiview%d", recovery.rename_index[tensor_dot->index]);
1576
77
  int i;
1577
161
  for (i = 0; i < depth; i++) // Print subscript marks to denote depth.
1578
84
    fputc('\'', out);
1579
77
  if (CCV_GET_TAPE_ALLOC(mv->type))
1580
77
    
fputs(" (t)", out)7
;
1581
77
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
1582
77
  {
1583
77
    _ccv_nnc_graph_dot_tensor_multiview_one(mv, recovery, depth, tensor_index, out);
1584
77
    const ccv_nnc_tensor_t* root = (ccv_nnc_tensor_t*)mv;
1585
156
    while (CCV_IS_TENSOR_MULTIVIEW(root))
1586
79
      root = CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)root)[0];
1587
77
    fprintf(out, "|%d", root->info.dim[0]);
1588
105
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && root->info.dim[i]; i++)
1589
28
      fprintf(out, "x%d", root->info.dim[i]);
1590
77
    fputc('}', out);
1591
77
  } else
1592
0
    *tensor_index += _ccv_nnc_graph_dot_tensor_multiview_count(mv);
1593
77
}
1594
1595
static void _ccv_nnc_graph_dot_node(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int exec_index, ccv_nnc_stream_context_t** const streams, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* const tensor_index)
1596
660
{
1597
660
  fprintf(out, "node%d [shape=record,label=\"", exec_index);
1598
660
  _ccv_nnc_graph_dot_exec(exec_index, exec_info, schd_info, streams, flags, out);
1599
660
  int i;
1600
660
  int k = *tensor_index;
1601
660
  if (exec_info->input_size > 0)
1602
542
  {
1603
542
    fputs("|{Input", out);
1604
1.85k
    for (i = 0; i < exec_info->input_size; i++)
1605
1.31k
      if (exec_info->inputs[i])
1606
1.09k
      {
1607
1.09k
        fputc('|', out);
1608
1.09k
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1609
1.09k
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1610
1.06k
        else {
1611
1.06k
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1612
1.06k
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1613
1.06k
          ++k;
1614
1.06k
        }
1615
1.09k
      } else
1616
218
        fputs("|-", out);
1617
542
    fputc('}', out);
1618
542
  }
1619
660
  if (exec_info->output_size > 0)
1620
604
  {
1621
604
    fputs("|{Output", out);
1622
1.36k
    for (i = 0; i < exec_info->output_size; i++)
1623
763
      if (exec_info->outputs[i])
1624
738
      {
1625
738
        fputc('|', out);
1626
738
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1627
738
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1628
708
        else {
1629
708
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1630
708
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1631
708
          ++k;
1632
708
        }
1633
738
      } else
1634
25
        fputs("|-", out);
1635
604
    fputc('}', out);
1636
604
  }
1637
660
  fputs("\"];\n", out);
1638
660
  *tensor_index = k;
1639
660
}
1640
1641
static void _ccv_nnc_graph_dot_while_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const ccv_nnc_graph_t* const while_graph, const int flags, const int depth, FILE* out, int* tensor_index)
1642
25
{
1643
25
  int i;
1644
25
  fprintf(out, "label=<<b>while%d </b>Command: ", exec_index);
1645
25
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1646
25
  fputs(">;\n", out);
1647
25
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
1648
25
  int k = *tensor_index;
1649
25
  if (exec_info->input_size > 0)
1650
16
  {
1651
16
    fputs("{Input|{", out);
1652
39
    for (i = 0; i < exec_info->input_size; i++)
1653
23
    {
1654
23
      if (i > 0)
1655
7
        fputc('|', out);
1656
23
      if (exec_info->inputs[i])
1657
23
      {
1658
23
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1659
23
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1660
22
        else {
1661
22
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1662
22
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1663
22
          ++k;
1664
22
        }
1665
23
      } else
1666
0
        fputc('-', out);
1667
23
    }
1668
16
    fputs("}}", out);
1669
16
  }
1670
25
  if (exec_info->output_size > 0)
1671
15
  {
1672
15
    if (exec_info->input_size > 0)
1673
12
      fputs("|", out);
1674
15
    fputs("{Output|{", out);
1675
38
    for (i = 0; i < exec_info->output_size; i++)
1676
23
    {
1677
23
      if (i > 0)
1678
8
        fputc('|', out);
1679
23
      if (exec_info->outputs[i])
1680
23
      {
1681
23
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1682
23
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1683
23
        else {
1684
23
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1685
23
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1686
23
          ++k;
1687
23
        }
1688
23
      } else
1689
0
        fputc('-', out);
1690
23
    }
1691
15
    fputs("}}", out);
1692
15
  }
1693
25
  fputs("}\"];\n", out);
1694
25
  *tensor_index = k;
1695
25
}
1696
1697
static void _ccv_nnc_graph_dot_case_of_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* tensor_index)
1698
12
{
1699
12
  int i;
1700
12
  fprintf(out, "label=<<b>caseof%d </b>Command: ", exec_index);
1701
12
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
1702
12
  fputs(">;\n", out);
1703
12
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
1704
12
  int k = *tensor_index;
1705
12
  if (exec_info->input_size > 0)
1706
11
  {
1707
11
    fputs("{Input|{", out);
1708
22
    for (i = 0; i < exec_info->input_size; i++)
1709
11
    {
1710
11
      if (i > 0)
1711
0
        fputc('|', out);
1712
11
      if (exec_info->inputs[i])
1713
11
      {
1714
11
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
1715
11
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
1716
9
        else {
1717
9
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1718
9
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1719
9
          ++k;
1720
9
        }
1721
11
      } else
1722
0
        fputc('-', out);
1723
11
    }
1724
11
    fputs("}}", out);
1725
11
  }
1726
12
  if (exec_info->output_size > 0)
1727
11
  {
1728
11
    if (exec_info->input_size > 0)
1729
10
      fputs("|", out);
1730
11
    fputs("{Output|{", out);
1731
24
    for (i = 0; i < exec_info->output_size; i++)
1732
13
    {
1733
13
      if (i > 0)
1734
2
        fputc('|', out);
1735
13
      if (exec_info->outputs[i])
1736
13
      {
1737
13
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
1738
13
          
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
1739
2
        else {
1740
2
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
1741
2
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
1742
2
          ++k;
1743
2
        }
1744
13
      } else
1745
0
        fputc('-', out);
1746
13
    }
1747
11
    fputs("}}", out);
1748
11
  }
1749
12
  fputs("}\"];\n", out);
1750
12
  *tensor_index = k;
1751
12
}
1752
1753
static void _ccv_nnc_graph_dot_sub_graphs(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_tensor_dot_recovery_t p_recovery, const ccv_array_t* const sub_graphs, const int flags, const int depth, FILE* out, int* tensor_index, int* exec_index)
1754
37
{
1755
37
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
1756
25
  {
1757
25
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
1758
25
    const ccv_nnc_graph_t* const while_graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[0] - 1);
1759
25
    // Output this node info within this subgraph.
1760
25
    _ccv_nnc_graph_dot_while_label(exec_info, *exec_index, p_recovery, while_graph, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
1761
25
  } else if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
1762
12
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
1763
12
    _ccv_nnc_graph_dot_case_of_label(exec_info, *exec_index, p_recovery, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
1764
12
  }
1765
37
  ++(*exec_index);
1766
37
  int p;
1767
94
  for (p = 0; p < exec_info->graph_ref_size; p++)
1768
57
  {
1769
57
    if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1770
32
    {
1771
32
      fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\nlabel=\"\"\n", *exec_index, *exec_index);
1772
32
      ++(*exec_index);
1773
32
    }
1774
57
    const ccv_nnc_graph_t* const graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[p] - 1);
1775
57
    const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
1776
57
    ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
1777
57
    int i, j;
1778
57
    int k = 0;
1779
57
    int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
1780
57
    // Output styles.
1781
167
    for (i = 0; i < graph->exec_info->rnum; i++)
1782
110
    {
1783
110
      node_id[i] = *exec_index;
1784
110
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1785
110
      if (CCV_NNC_GRAPH_REF(exec_info)[0])
1786
3
        _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, depth + 1, out, &k, exec_index);
1787
107
      else {
1788
107
        _ccv_nnc_graph_dot_node(exec_info,
1789
107
          schedule ? (i < schedule->exec_info_size ? schedule->exec_info + i : 0) : 0,
1790
107
          *exec_index, graph->streams, recovery, flags, depth, out, &k);
1791
107
        ++(*exec_index);
1792
107
      }
1793
110
    }
1794
57
    // Output connections.
1795
167
    for (i = 0; i < graph->exec_info->rnum; i++)
1796
110
    {
1797
110
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1798
110
      if (exec_info->outgoings)
1799
108
        
        for (j = 0; j < exec_info->outgoings->rnum; j++)
1800
55
        {
1801
55
          const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
1802
55
          const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
1803
55
          // If both are sub-graphs, have both tail and head specified.
1804
55
          if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1805
0
            fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
1806
55
          else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
1807
1
            fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
1808
54
          else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1809
3
            fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
1810
51
          else
1811
51
            fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
1812
55
        }
1813
110
    }
1814
57
    fputs("}\n", out);
1815
57
    _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
1816
57
    ccfree(node_id);
1817
57
  }
1818
37
  // Extra subgraph cluster.
1819
37
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
1820
12
    fputs("}\n", out);
1821
37
}
1822
1823
void ccv_nnc_graph_dot(const ccv_nnc_graph_t* const graph, const int flags, FILE* out)
1824
112
{
1825
112
  fputs("digraph G {\ncompound=true;\n", out);
1826
112
  ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
1827
112
  int i, j;
1828
112
  int k = 0, c = 0;
1829
112
  int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
1830
112
  const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule;
1831
112
  // Output styles.
1832
699
  for (i = 0; i < graph->exec_info->rnum; i++)
1833
587
  {
1834
587
    node_id[i] = c;
1835
587
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1836
587
    if (CCV_NNC_GRAPH_REF(exec_info)[0])
1837
34
      _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, 1, out, &k, &c);
1838
553
    else {
1839
553
      _ccv_nnc_graph_dot_node(exec_info,
1840
553
        schedule ? (i < schedule->exec_info_size ? schedule->exec_info + i : 0) : 0,
1841
553
        c, graph->streams, recovery, flags, 0, out, &k);
1842
553
      ++c;
1843
553
    }
1844
587
  }
1845
112
  // Output connections.
1846
699
  for (i = 0; i < graph->exec_info->rnum; i++)
1847
587
  {
1848
587
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1849
587
    if (exec_info->outgoings)
1850
1.19k
      
      for (j = 0; j < exec_info->outgoings->rnum; j++)
1851
721
      {
1852
721
        const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
1853
721
        const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
1854
721
        // If both are sub-graphs, have both tail and head specified.
1855
721
        if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1856
3
          fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
1857
718
        else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
1858
15
          fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
1859
703
        else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
1860
8
          fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
1861
695
        else
1862
695
          fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
1863
721
      }
1864
587
  }
1865
112
  fputs("}\n", out);
1866
112
  _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
1867
112
  ccfree(node_id);
1868
112
}
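A minimal driver for the emitter above (hypothetical caller code; CCV_NNC_LONG_DOT_GRAPH is the flag this file tests for the long form):
// Dump the long-form graph, then render offline with Graphviz:
//   dot -Tpng graph.dot -o graph.png
FILE* out = fopen("graph.dot", "w");
if (out)
{
  ccv_nnc_graph_dot(graph, CCV_NNC_LONG_DOT_GRAPH, out);
  fclose(out);
}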
1869
1870
void ccv_nnc_graph_autotune(ccv_nnc_graph_t* const graph, const size_t max_workspace_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
1871
87
{
1872
87
  // Execute the current node; for synchronous CPU execution, there is no stream unit.
1873
87
  int i;
1874
87
#define visitor(node, idx, ...) \
1875
2.69k
  do { \
1876
2.69k
    if (node->cmd.cmd == CCV_NNC_NOOP) \
1877
2.69k
      
      continue; \
1878
2.69k
    
    if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD) \
1879
2.62k
      
      for (i = 0; i < node->graph_ref_size; i++) \
1880
18
      { \
1881
18
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[i] - 1); \
1882
18
        ccv_nnc_graph_autotune(sub_graph, max_workspace_size, flags, 0, 0, 0, 0); \
1883
18
      } \
1884
2.62k
    else { \
1885
2.60k
      /* Need to unwrap these tensors */ \
1886
15.3k
      for (i = 0; i < node->input_size + node->output_size; i++) \
1887
12.7k
        if (node->inputs[i] && CCV_IS_TENSOR_MULTIVIEW(node->inputs[i])) \
1888
12.7k
          
          node->inputs[i] = _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)node->inputs[i]); \
1889
2.60k
      PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); \
1890
10.8k
      for (i = 0; i < node->input_size; i++) \
1891
8.19k
        PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p)\n", i + 1, node->inputs[i], (node->inputs[i] ? node->inputs[i]->data.u8 : 0)); \
1892
7.11k
      for (i = 0; i < node->output_size; i++) \
1893
4.50k
        PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p)\n", i + 1, node->outputs[i], (node->outputs[i] ? node->outputs[i]->data.u8 : 0)); \
1894
2.60k
      node->cmd = ccv_nnc_cmd_autotune(node->cmd, max_workspace_size, node->hint, flags, node->inputs, node->input_size, node->outputs, node->output_size, 0); \
1895
2.60k
    } \
1896
2.69k
  } while (0)
1897
87
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0);
1898
87
  const int graph_source_size = source_size ? source_size : (graph->sources ? graph->sources->rnum : 0);
1899
87
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0);
1900
87
  const int graph_destination_size = destination_size ? destination_size : (graph->destinations ? graph->destinations->rnum : 0);
1901
2.69k
  
  CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1902
87
#undef visitor
1903
87
}
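A sketch of a typical call (assumed caller code): passing 0 for the source and destination arguments falls back to the graph's own sources and destinations, as the ternaries above show.
// Let every node pick its fastest backend, allowing up to 512 MiB of
// scratch workspace per command.
ccv_nnc_graph_autotune(graph, 512 * 1024 * 1024, 0, 0, 0, 0, 0);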
1904
1905
void ccv_nnc_graph_free(ccv_nnc_graph_t* const graph)
1906
6.10k
{
1907
6.10k
  int i, j;
1908
38.1k
  for (i = 0; i < graph->exec_info->rnum; i++)
1909
32.0k
  {
1910
32.0k
    ccv_nnc_graph_exec_info_t *info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
1911
32.0k
    if (info->_heap_graph_ref)
1912
32.0k
      
      ccfree(info->_heap_graph_ref);
1913
32.0k
    ccv_array_t* outgoings = info->outgoings;
1914
32.0k
    if (outgoings)
1915
25.9k
      ccv_array_free(outgoings);
1916
32.0k
    // We allocate inputs & outputs in continuous fashion, therefore, only need to free the input array.
1917
32.0k
    if (info->inputs)
1918
32.0k
      
      ccfree(info->inputs);
1919
32.0k
    if (info->input_flags)
1920
32.0k
      
      ccfree(info->input_flags);
1921
32.0k
    if (info->updates)
1922
32.0k
      
      ccfree(info->updates);
1923
32.0k
    if ((info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) && 
info->p_while.inputs27
)
1924
32.0k
      
      ccfree(info->p_while.inputs);
1925
32.0k
  }
1926
6.10k
  if (graph->tensor_wraps)
1927
27
  {
1928
80
    for (i = 0; i < graph->tensor_wraps->rnum; i++)
1929
53
    {
1930
53
      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, i);
1931
53
      if (tensor_wrap_array)
1932
52
      {
1933
195
        for (j = 0; j < tensor_wrap_array->size; j++)
1934
143
          _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[j]);
1935
52
        ccfree(tensor_wrap_array);
1936
52
      }
1937
53
    }
1938
27
    ccv_array_free(graph->tensor_wraps);
1939
27
  }
1940
6.10k
  if (graph->tensor_wraps_refs)
1941
44
    ccv_array_free(graph->tensor_wraps_refs);
1942
6.10k
  if (graph->breakpoints)
1943
6.10k
    
    ccfree(graph->breakpoints);
1944
6.10k
  if (graph->sources)
1945
6.09k
    ccv_array_free(graph->sources);
1946
6.10k
  if (graph->destinations)
1947
6.09k
    ccv_array_free(graph->destinations);
1948
6.10k
  if (graph->default_schedule)
1949
301
    ccv_nnc_graph_static_schedule_free(graph->default_schedule);
1950
6.10k
  if (graph->streams)
1951
301
  {
1952
301
    // If the graph has a parent graph, the default stream is allocated by the parent graph, so we need to skip it.
1953
301
    if (!graph->p)
1954
297
      ccv_nnc_stream_context_free(graph->streams[0]);
1955
1.07k
    for (i = 1; i < graph->stream_size; i++)
1956
769
      ccv_nnc_stream_context_free(graph->streams[i]);
1957
301
    ccfree(graph->streams);
1958
301
  }
1959
6.10k
  if (graph->block_stream_tasks)
1960
6.10k
    
    ccfree(graph->block_stream_tasks);
1961
6.10k
  if (graph->signals)
1962
301
  {
1963
1.44k
    for (i = 0; i < graph->signal_size; i++)
1964
1.14k
      ccv_nnc_stream_signal_free(graph->signals[i]);
1965
301
    ccfree(graph->signals);
1966
301
  }
1967
6.10k
  if (graph->carry_overs)
1968
21
  {
1969
46
    for (i = 0; i < graph->carry_overs->rnum; i++)
1970
25
    {
1971
25
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
1972
25
      _ccv_nnc_graph_tensor_wrap_free(carry_over->from);
1973
25
      _ccv_nnc_graph_tensor_wrap_free(carry_over->to);
1974
25
    }
1975
21
    ccv_array_free(graph->carry_overs);
1976
21
  }
1977
6.10k
  if (graph->sub_graphs)
1978
35
  {
1979
94
    for (i = 0; i < graph->sub_graphs->rnum; i++)
1980
59
      ccv_nnc_graph_free(*(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i));
1981
35
    ccv_array_free(graph->sub_graphs);
1982
35
  }
1983
6.10k
  ccv_array_free(graph->exec_info);
1984
6.10k
  if (graph->buffer)
1985
6.10k
    
    ccfree(graph->buffer);
1986
6.10k
  ccfree(graph);
1987
6.10k
}
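Teardown sketch: freeing the root graph is sufficient, because the code above recurses into graph->sub_graphs, and a child graph skips stream 0 since the parent owns it (the !graph->p check).
// Hypothetical end of lifecycle: releases exec info, tensor wraps, streams,
// signals, schedules, and all sub-graphs in one call.
ccv_nnc_graph_free(graph);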