Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_graph.c

 Count| Source
      | #include "ccv_nnc.h"
      | #include "ccv_nnc_easy.h"
      | #include "ccv_nnc_internal.h"
      | #include "ccv_internal.h"
      | #include "_ccv_nnc_graph.h"
      |
      | #pragma mark - Level-2 API
      |
      | ccv_nnc_graph_t* ccv_nnc_graph_new(void)
 1.19k| {
 1.19k|   ccv_nnc_graph_t* graph = (ccv_nnc_graph_t*)cccalloc(1, sizeof(ccv_nnc_graph_t));
 1.19k|   graph->exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), 5, 0);
 1.19k|   return graph;
 1.19k| }
      |
      | void ccv_nnc_graph_set_sources(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const sources, const int source_size)
 1.18k| {
 1.18k|   if (!graph->sources)
 1.18k|     graph->sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), source_size, 0);
     0|   else
     0|     ccv_array_clear(graph->sources);
 1.18k|   int i;
 2.37k|   for (i = 0; i < source_size; i++)
 1.18k|     ccv_array_push(graph->sources, sources + i);
 1.18k|   graph->topsorted = 0;
 1.18k| }
      |
      | ccv_nnc_graph_exec_t* ccv_nnc_graph_sources(const ccv_nnc_graph_t* const graph)
     0| {
     0|   return graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0;
     0| }
      |
      | int ccv_nnc_graph_source_size(const ccv_nnc_graph_t* const graph)
     0| {
     0|   return graph->sources ? graph->sources->rnum : 0;
     0| }
      |
      | void ccv_nnc_graph_set_destinations(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
 1.18k| {
 1.18k|   if (!graph->destinations)
 1.18k|     graph->destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), destination_size, 0);
     0|   else
     0|     ccv_array_clear(graph->destinations); // Fixed: originally cleared graph->sources, which would leave stale destinations. This branch is uncovered (count 0).
 1.18k|   int i;
 2.37k|   for (i = 0; i < destination_size; i++)
 1.18k|     ccv_array_push(graph->destinations, destinations + i);
 1.18k|   graph->topsorted = 0;
 1.18k| }
      |
      | ccv_nnc_graph_exec_t* ccv_nnc_graph_destinations(const ccv_nnc_graph_t* const graph)
     0| {
     0|   return graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0;
     0| }
      |
      | int ccv_nnc_graph_destination_size(const ccv_nnc_graph_t* const graph)
     0| {
     0|   return graph->destinations ? graph->destinations->rnum : 0;
     0| }
      |
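For orientation, a minimal sketch of how this Level-2 API is driven end to end. Only functions whose signatures appear in this file are used, plus ccv_nnc_cmd(), ccv_nnc_cmd_auto, ccv_nnc_no_hint and ccv_nnc_graph_free() from the library's public header; the NOOP command is a stand-in for a real command:

    // A two-node chain: a -> b. No tensors are attached yet.
    ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
    const ccv_nnc_cmd_t noop = ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0);
    const ccv_nnc_graph_exec_t a = ccv_nnc_graph_exec_new(graph, noop, ccv_nnc_no_hint, 0, 0, 0, 0);
    const ccv_nnc_graph_exec_t b = ccv_nnc_graph_exec_new(graph, noop, ccv_nnc_no_hint, 0, 0, 0, 0);
    ccv_nnc_graph_exec_concat(graph, a, b); // edge a -> b
    ccv_nnc_graph_set_sources(graph, &a, 1);
    ccv_nnc_graph_set_destinations(graph, &b, 1);
    ccv_nnc_graph_free(graph);
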
      | void ccv_nnc_graph_exec_set(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_cmd_t cmd)
  713k| {
  713k|   assert(exec.d < graph->exec_info->rnum);
  713k|   assert(exec.graph == graph);
  713k|   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
  713k|   exec_info->cmd = cmd;
  713k| }
      |
      | void ccv_nnc_graph_exec_set_hint(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_hint_t hint)
    64| {
    64|   assert(exec.d < graph->exec_info->rnum);
    64|   assert(exec.graph == graph);
    64|   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
    64|   exec_info->hint = hint;
    64| }
      |
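Both setters mutate a node in place and are cheap to call repeatedly. A sketch of retargeting the node a from the sketch above (the DATA_TRANSFER command is an illustrative choice, not taken from this report):

    ccv_nnc_graph_exec_set(graph, a, ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, ccv_nnc_cmd_auto, 0));
    ccv_nnc_graph_exec_set_hint(graph, a, ccv_nnc_no_hint);
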
      | static int _ccv_nnc_tensor_multiview_level_count(const ccv_nnc_tensor_multiview_t* const mv)
   482| {
   482|   if (!CCV_IS_TENSOR_MULTIVIEW(mv))
   482|     return 1;
   155|   const int count = mv->kind + mv->repeat;
   155|   int i, c = 0;
   502|   for (i = 0; i < count; i++)
   347|   {
   347|     ccv_nnc_tensor_t* tv = CCV_NNC_MULTIVIEW_DATA(mv)[i];
   347|     if (tv == CCV_NNC_TENSOR_PLACEHOLDER)
   347|       c = ccv_max(c, 1);
   347|     else
   347|       c = ccv_max(c, _ccv_nnc_tensor_multiview_level_count((ccv_nnc_tensor_multiview_t*)tv));
   347|   }
   155|   return c + 1;
   155| }
      |
      | static ccv_nnc_graph_tensor_wrap_t* _ccv_nnc_graph_tensor_wrap_new(const ccv_nnc_tensor_multiview_t* const mv)
   143| {
   143|   const int level_count = _ccv_nnc_tensor_multiview_level_count(mv);
   143|   ccv_nnc_graph_tensor_wrap_t* tensor_wrap = (ccv_nnc_graph_tensor_wrap_t*)ccmalloc(sizeof(ccv_nnc_graph_tensor_wrap_t) + sizeof(ccv_nnc_tensor_t*) * (level_count - 1));
   143|   tensor_wrap->update_required = 0;
   143|   tensor_wrap->count = level_count;
   143|   tensor_wrap->index = 0;
   143|   tensor_wrap->tensors[0] = (ccv_nnc_tensor_t*)mv;
   143|   return tensor_wrap;
   143| }
      |
      | static void _ccv_nnc_graph_exec_rewind(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
    23| {
    23|   if (!info->tensor_wraps_ref)
    22|     return;
     1|   int i;
     1|   assert(info->tensor_wraps_ref <= graph->tensor_wraps->rnum);
     1|   ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
     1|   // Rewind from tensor wraps.
     3|   for (i = 0; i < info->input_size; i++)
     2|     if (tensor_wrap_array->tensor_wraps[i])
     1|       info->inputs[i] = tensor_wrap_array->tensor_wraps[i]->tensors[0];
     1|   const int d = info->input_size;
     2|   for (i = 0; i < info->output_size; i++)
     1|     if (tensor_wrap_array->tensor_wraps[d + i])
     1|       info->outputs[i] = tensor_wrap_array->tensor_wraps[d + i]->tensors[0];
     1|   const int dd = info->input_size + info->output_size;
     1|   for (i = 0; i < info->update_size; i++)
     0|     if (tensor_wrap_array->tensor_wraps[dd + i])
     0|       info->updates[i] = tensor_wrap_array->tensor_wraps[dd + i]->tensors[0];
     1| }
      |
      | static void _ccv_nnc_graph_tensor_wrap_free(ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
   195| {
   195|   ccfree(tensor_wrap);
   195| }
      |
      | ccv_nnc_graph_tensor_wrap_array_t* ccv_nnc_get_tensor_wrap_array(ccv_nnc_graph_t* const graph, const int tensor_wrap_size, int* const tensor_wraps_ref)
    62| {
    62|   ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = *tensor_wraps_ref ? (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, *tensor_wraps_ref - 1) : 0;
    62|   // Otherwise, find an open slot.
    62|   if (!tensor_wrap_array_ref)
    53|   {
    53|     if (!graph->tensor_wraps)
    27|       graph->tensor_wraps = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wrap_array_t*), 0, 0);
    53|     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = 0;
    53|     ccv_array_push(graph->tensor_wraps, &tensor_wrap_array);
    53|     tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, graph->tensor_wraps->rnum - 1);
    53|     *tensor_wraps_ref = graph->tensor_wraps->rnum;
    53|   }
    62|   int i;
    62|   if (*tensor_wrap_array_ref)
     9|   {
     9|     if ((*tensor_wrap_array_ref)->size != tensor_wrap_size)
     9|       *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)ccrealloc(*tensor_wrap_array_ref, sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1));
    18|     for (i = (*tensor_wrap_array_ref)->size; i < tensor_wrap_size; i++)
     9|       (*tensor_wrap_array_ref)->tensor_wraps[i] = 0;
     9|   } else
    53|     *tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)cccalloc(sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1), 1);
    62|   ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
    62|   tensor_wrap_array->size = tensor_wrap_size;
    62|   return tensor_wrap_array;
    62| }
      |
      | void ccv_nnc_set_tensor_wraps(ccv_nnc_graph_tensor_wrap_t** const tensor_wraps, ccv_nnc_tensor_t* const* const tensors, const int tensor_size)
   184| {
   184|   int i;
   349|   for (i = 0; i < tensor_size; i++)
   165|     if (tensors[i])
   164|     {
   164|       if (CCV_IS_TENSOR_MULTIVIEW(tensors[i]) &&
   164|         ((ccv_nnc_tensor_multiview_t*)tensors[i])->anchor != CCV_NNC_MULTIVIEW_PHI)
   164|       {
   107|         if (!tensor_wraps[i] || tensors[i] != tensor_wraps[i]->tensors[0])
    93|         {
    93|           if (tensor_wraps[i])
     0|             _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
    93|           tensor_wraps[i] = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)tensors[i]);
    93|         }
   107|       } else {
    57|         if (tensor_wraps[i])
     0|           _ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
    57|         tensor_wraps[i] = 0;
    57|       }
   164|     }
   184| }
      |
      | void ccv_nnc_graph_register_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
    53| {
    53|   ccv_nnc_graph_t* p = graph;
    53|   const ccv_nnc_graph_tensor_wraps_ref_t tensor_wraps_ref = {
    53|     .d = tensor_wraps_ref_d,
    53|     .graph = graph,
    53|   };
    99|   do {
    99|     if (!p->tensor_wraps_refs)
    44|     {
    44|       p->tensor_wraps_refs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wraps_ref_t), 0, 0);
    44|       ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
    55|     } else {
    55|       int i;
    55|       int has_tensor_wraps_ref = 0;
   152|       for (i = 0; !has_tensor_wraps_ref && i < p->tensor_wraps_refs->rnum; i++)
    97|       {
    97|         ccv_nnc_graph_tensor_wraps_ref_t* tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
    97|         has_tensor_wraps_ref = (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph);
    97|       }
    55|       if (!has_tensor_wraps_ref)
    55|         ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
    55|     }
    99|     p = p->p;
    99|   } while (p);
    53| }
      |
      | static void _ccv_nnc_graph_redo_tensor_wraps(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
 7.86k| {
 7.86k|   int i;
 7.86k|   const int has_wrap = ccv_nnc_tensors_have_wraps(info->inputs, info->input_size) ||
 7.86k|     ccv_nnc_tensors_have_wraps(info->outputs, info->output_size) ||
 7.86k|     ccv_nnc_tensors_have_wraps(info->updates, info->update_size);
 7.86k|   if (has_wrap)
    61|   {
    61|     const int tensor_wrap_size = info->input_size + info->output_size + info->update_size;
    61|     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = ccv_nnc_get_tensor_wrap_array(graph, tensor_wrap_size, &info->tensor_wraps_ref);
    61|     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps, info->inputs, info->input_size);
    61|     const int d = info->input_size;
    61|     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + d, info->outputs, info->output_size);
    61|     const int dd = info->input_size + info->output_size;
    61|     ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + dd, info->updates, info->update_size);
 7.80k|   } else if (info->tensor_wraps_ref) {
     1|     ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
     1|     ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
     1|     if (tensor_wrap_array)
     1|     {
     4|       for (i = 0; i < tensor_wrap_array->size; i++)
     3|         if (tensor_wrap_array->tensor_wraps[i])
     2|           _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[i]);
     1|       ccfree(tensor_wrap_array);
     1|       *tensor_wrap_array_ref = 0;
     1|       info->tensor_wraps_ref = 0;
     1|     }
     1|   }
 7.86k| }
      |
      | static void _ccv_nnc_graph_deregister_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
     1| {
     1|   ccv_nnc_graph_t* p = graph;
     2|   do {
     2|     int i;
     2|     // Remove from the array.
     2|     if (p->tensor_wraps_refs)
     2|       for (i = 0; i < p->tensor_wraps_refs->rnum; i++)
     2|       {
     2|         ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
     2|         if (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph)
     2|         {
     2|           --p->tensor_wraps_refs->rnum;
     2|           if (i < p->tensor_wraps_refs->rnum)
     0|             memcpy(tensor_wraps_ref, tensor_wraps_ref + 1, sizeof(ccv_nnc_graph_tensor_wraps_ref_t) * (p->tensor_wraps_refs->rnum - i)); // Fixed: originally sized by ccv_nnc_graph_exec_t; the two structs happen to share a layout, but the element type here is ccv_nnc_graph_tensor_wraps_ref_t.
     2|           break;
     2|         }
     2|       }
     2|     p = p->p;
     2|   } while (p);
     1| }
      |
      | void ccv_nnc_graph_exec_set_io_flags(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const int* const input_flags, const int input_flag_size, const int* const output_flags, const int output_flag_size)
 7.62k| {
 7.62k|   assert(exec.d < graph->exec_info->rnum);
 7.62k|   assert(exec.graph == graph);
 7.62k|   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
 7.62k|   assert(input_flag_size <= info->input_size);
 7.62k|   assert(output_flag_size <= info->output_size);
 7.62k|   if (info->input_size + info->output_size == 0)
    19|     return;
 7.60k|   if (!info->input_flags)
 7.60k|   {
 7.60k|     info->input_flags = (int*)cccalloc(info->input_size + info->output_size, sizeof(int));
 7.60k|     info->output_flags = info->input_flags + info->input_size;
 7.60k|   }
 7.60k|   if (input_flag_size > 0)
     0|     memcpy(info->input_flags, input_flags, sizeof(int) * input_flag_size);
 7.60k|   if (output_flag_size > 0)
     0|     memcpy(info->output_flags, output_flags, sizeof(int) * output_flag_size);
 7.60k| }
      |
      | void ccv_nnc_graph_exec_set_peer(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_graph_exec_t peer_exec)
   213| {
   213|   assert(exec.graph == graph);
   213|   assert(exec.d >= 0);
   213|   assert(exec.d < graph->exec_info->rnum);
   213|   assert(peer_exec.graph == graph || peer_exec.graph == graph->peer);
   213|   assert(peer_exec.d >= 0);
   213|   if (peer_exec.graph == graph)
   209|     { assert(peer_exec.d < graph->exec_info->rnum); }
     4|   else
     4|     { assert(peer_exec.d < graph->peer->exec_info->rnum); }
   213|   ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
   213|   exec_info->peer_ref = peer_exec.d + 1;
   213| }
      |
      | static ccv_nnc_tensor_t* _ccv_nnc_any_tensor_from_tensor_multiview(ccv_nnc_tensor_multiview_t* const mv)
    92| {
    92|   ccv_nnc_tensor_t* tensor = (ccv_nnc_tensor_t*)mv;
   188|   while (CCV_IS_TENSOR_MULTIVIEW(tensor))
    96|   {
    96|     ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
    96|     const int count = 0;
    96|     const int off = mv->kind;
    96|     const int mod = mv->repeat;
    96|     // If reached the root.
    96|     tensor = CCV_NNC_MULTIVIEW_DATA(mv)[count >= off ? ((count - off) % mod) + off : count]; // Unwrap.
    96|   }
    92|   return tensor;
    92| }
      |
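The unwrap above walks a wheel: the first `kind` slots are a fixed prefix and the next `repeat` slots cycle. A standalone sketch of the same index rule, with `count` generalized to a running counter (the function above always passes 0):

    // Same index rule as the unwrap above: slots [0, off) are picked once,
    // slots [off, off + mod) repeat cyclically as count grows.
    static int wheel_index(const int count, const int off, const int mod)
    {
      return count >= off ? ((count - off) % mod) + off : count;
    }
    // e.g. off = 1, mod = 2 picks slot 0, then 1, 2, 1, 2, ...
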
      | void ccv_nnc_graph_exec_set_io(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
    23| {
    23|   assert(exec.d < graph->exec_info->rnum);
    23|   assert(exec.graph == graph);
    23|   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
    23|   // De-register from the graph if it contains multiview tensors.
    23|   if (info->tensor_wraps_ref)
     1|     _ccv_nnc_graph_deregister_tensor_wraps(graph, info->tensor_wraps_ref - 1);
    23|   // In case it is already executed, rewind.
    23|   _ccv_nnc_graph_exec_rewind(info, graph);
    23|   if (input_size == 0 && output_size == 0)
     1|   {
     1|     if (info->input_size > 0 || info->output_size > 0)
     1|       ccfree(info->inputs);
     1|     info->inputs = 0;
     1|     info->outputs = 0;
     1|     info->input_size = 0;
     1|     info->output_size = 0;
     1|     _ccv_nnc_graph_redo_tensor_wraps(info, graph);
     1|     if (info->tensor_wraps_ref)
     0|       ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
     1|     return;
     1|   }
    22|   if (info->inputs)
     2|     info->inputs = (ccv_nnc_tensor_t**)ccrealloc(info->inputs, sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
    20|   else
    20|     info->inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
    22|   info->outputs = info->inputs + input_size;
    22|   if (inputs)
    22|     memcpy(info->inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
    22|   if (outputs)
    22|     memcpy(info->outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
    22|   int i;
    22|   int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
    77|   for (i = 0; i < input_size + output_size; i++)
    55|     if (info->inputs[i])
    55|     {
    55|       ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info->inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info->inputs[i]) : info->inputs[i];
    55|       tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= tensor->info.datatype;
    55|     }
    22|   info->cmd.backend = ccv_nnc_cmd_find_backend(info->cmd, tensor_memory, tensor_formats, tensor_datatypes);
    22|   info->input_size = input_size;
    22|   info->output_size = output_size;
    22|   _ccv_nnc_graph_redo_tensor_wraps(info, graph);
    22|   // Register again if the tensor wraps exist.
    22|   if (info->tensor_wraps_ref)
     2|     ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
    22|   // Free flags.
    22|   if (info->input_flags)
     0|   {
     0|     ccfree(info->input_flags);
     0|     info->input_flags = info->output_flags = 0;
     0|   }
    22| }
      |
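A sketch of rebinding I/O on an existing node e. ccv_nnc_tensor_new() is from the library's public header; the CPU_TENSOR_NHWC parameter macro is an assumption from that header, not something this report shows:

    ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
    ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 8), 0);
    ccv_nnc_tensor_t* inputs[] = { x };
    ccv_nnc_tensor_t* outputs[] = { y };
    // Rebinds the tensors, rewinds any prior multiview unwrapping, re-registers
    // tensor wraps if needed, and re-picks the backend for the new tensor types.
    ccv_nnc_graph_exec_set_io(graph, e, inputs, 1, outputs, 1);
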
      | void ccv_nnc_graph_exec_add_update(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const update)
    23| {
    23|   assert(CCV_IS_TENSOR_MULTIVIEW(update));
    23|   assert(exec.d < graph->exec_info->rnum);
    23|   assert(exec.graph == graph);
    23|   ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
    23|   const int register_tensor_wraps = !info->tensor_wraps_ref;
    23|   const int update_index = info->update_size;
    23|   ++info->update_size;
    23|   if (info->updates)
     6|     info->updates = (ccv_nnc_tensor_t**)ccrealloc(info->updates, sizeof(ccv_nnc_tensor_t*) * info->update_size);
    17|   else
    17|     info->updates = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * info->update_size);
    23|   info->updates[update_index] = update;
    23|   _ccv_nnc_graph_redo_tensor_wraps(info, graph);
    23|   if (register_tensor_wraps)
    14|     ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
    23| }
      |
      | ccv_nnc_graph_exec_t ccv_nnc_graph_exec_new(ccv_nnc_graph_t* const graph, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
 7.82k| {
 7.82k|   int d = graph->exec_info->rnum;
 7.82k|   ccv_nnc_graph_exec_info_t info = {
 7.82k|     .cmd = cmd,
 7.82k|     .hint = hint,
 7.82k|     .input_size = input_size,
 7.82k|     .output_size = output_size,
 7.82k|   };
 7.82k|   assert(inputs || input_size == 0);
 7.82k|   assert(outputs || output_size == 0);
 7.82k|   if (input_size > 0 || output_size > 0)
 7.69k|   {
 7.69k|     info.inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
 7.69k|     info.outputs = info.inputs + input_size;
 7.69k|     if (inputs)
 7.63k|       memcpy(info.inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
 7.69k|     if (outputs)
 7.69k|       memcpy(info.outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
 7.69k|     info.input_size = input_size;
 7.69k|     info.output_size = output_size;
 7.69k|     int i;
 7.69k|     int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
 42.8k|     for (i = 0; i < input_size + output_size; i++)
 35.1k|       if (info.inputs[i])
 28.7k|       {
 28.7k|         ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info.inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info.inputs[i]) : info.inputs[i];
 28.7k|         tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= tensor->info.datatype;
 28.7k|       }
 7.69k|     info.cmd.backend = ccv_nnc_cmd_find_backend(info.cmd, tensor_memory, tensor_formats, tensor_datatypes);
 7.69k|   }
 7.82k|   _ccv_nnc_graph_redo_tensor_wraps(&info, graph);
 7.82k|   // Add itself to the graph's wraps array; this helps at run time when we run the graph and do unwrapping.
 7.82k|   if (info.tensor_wraps_ref)
    36|     ccv_nnc_graph_register_tensor_wraps(graph, info.tensor_wraps_ref - 1);
 7.82k|   ccv_array_push(graph->exec_info, &info);
 7.82k|   return (ccv_nnc_graph_exec_t){
 7.82k|     .d = d,
 7.82k|     .graph = graph,
 7.82k|   };
 7.82k| }
      |
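Note how exec_new picks a backend eagerly: memory type, format and datatype are OR-folded across the attached tensors and handed to ccv_nnc_cmd_find_backend(). A sketch with concrete tensors; the command constant and tensor parameter macro are assumptions from the library's headers:

    ccv_nnc_tensor_t* const src = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16), 0);
    ccv_nnc_tensor_t* const dst = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 16), 0);
    ccv_nnc_tensor_t* inputs[] = { src };
    ccv_nnc_tensor_t* outputs[] = { dst };
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_new(graph,
      ccv_nnc_cmd(CCV_NNC_DATA_TRANSFER_FORWARD, 0, ccv_nnc_cmd_auto, 0),
      ccv_nnc_no_hint, inputs, 1, outputs, 1);
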
      | void ccv_nnc_graph_add_carry_over(ccv_nnc_graph_t* const graph, const ccv_nnc_tensor_t* const from, const ccv_nnc_tensor_t* const to)
    25| {
    25|   ccv_nnc_graph_tensor_carry_over_t carry_over = {
    25|     .from = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)from),
    25|     .to = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)to)
    25|   };
    25|   if (!graph->carry_overs)
    21|     graph->carry_overs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_carry_over_t), 0, 0);
    25|   ccv_array_push(graph->carry_overs, &carry_over);
    25| }
      |
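Carry-overs are how a while-loop graph threads state from one iteration into the next: both arguments are expected to be multiview tensors, and the `from` value carries over into the `to` slot between iterations (behavior inferred from the name; the run-time side is not part of this file). A loose sketch; the multiviews are assumed to be built elsewhere with the library's multiview setup API:

    // Wires iteration-to-iteration state for a while-loop graph.
    // mv_from: value produced at the end of iteration i.
    // mv_to:   slot read back at the start of iteration i + 1.
    static void wire_loop_state(ccv_nnc_graph_t* const while_graph,
      ccv_nnc_tensor_multiview_t* const mv_from, ccv_nnc_tensor_multiview_t* const mv_to)
    {
      ccv_nnc_graph_add_carry_over(while_graph, (ccv_nnc_tensor_t*)mv_from, (ccv_nnc_tensor_t*)mv_to);
    }
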
      | int ccv_nnc_graph_exec_concat(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
 7.87k| {
 7.87k|   assert(graph == source.graph);
 7.87k|   assert(graph == destination.graph);
 7.87k|   assert(source.d < graph->exec_info->rnum);
 7.87k|   assert(destination.d < graph->exec_info->rnum);
 7.87k|   ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
 7.87k|   if (src_info->outgoings == 0)
 6.62k|     src_info->outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
 1.24k|   else {
 1.24k|     int i;
 1.24k|     // Check if this is already connected, if so, skip.
 3.38k|     for (i = 0; i < src_info->outgoings->rnum; i++)
 2.13k|       if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
     0|         return -1;
 1.24k|   }
 7.87k|   ccv_array_push(src_info->outgoings, &destination.d);
 7.87k|   graph->topsorted = 0;
 7.87k|   return 0;
 7.87k| }
      |
      | int ccv_nnc_graph_exec_disjoin(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
     0| {
     0|   assert(graph == source.graph);
     0|   assert(graph == destination.graph);
     0|   assert(source.d < graph->exec_info->rnum);
     0|   assert(destination.d < graph->exec_info->rnum);
     0|   ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
     0|   if (!src_info->outgoings)
     0|     return -1;
     0|   int i, j = -1;
     0|   // Find the edge to remove; bail out if it does not exist.
     0|   for (i = 0; i < src_info->outgoings->rnum; i++)
     0|     if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
     0|     {
     0|       j = i;
     0|       break;
     0|     }
     0|   if (j < 0)
     0|     return -1;
     0|   if (j < src_info->outgoings->rnum - 1)
     0|     *(int*)ccv_array_get(src_info->outgoings, j) = *(int*)ccv_array_get(src_info->outgoings, src_info->outgoings->rnum - 1);
     0|   --src_info->outgoings->rnum;
     0|   ccv_nnc_graph_exec_info_t* dest_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, destination.d);
     0|   if (dest_info->outgoings)
     0|     for (i = 0; i < dest_info->outgoings->rnum; i++)
     0|       ccv_array_add_unique_int(src_info->outgoings, *(int*)ccv_array_get(dest_info->outgoings, i));
     0|   graph->topsorted = 0;
     0|   return 0;
     0| }
      |
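concat and disjoin are the edge add/remove pair: both return 0 on success and -1 for a duplicate edge (concat) or a missing one (disjoin), and disjoin splices the removed destination's outgoings into the source so downstream reachability is preserved. A sketch over four existing execs a, b, c, d:

    // Build a diamond: a -> b, a -> c, b -> d, c -> d.
    ccv_nnc_graph_exec_concat(graph, a, b);
    ccv_nnc_graph_exec_concat(graph, a, c);
    ccv_nnc_graph_exec_concat(graph, b, d);
    ccv_nnc_graph_exec_concat(graph, c, d);
    assert(ccv_nnc_graph_exec_concat(graph, a, b) == -1); // duplicate edge is rejected
    assert(ccv_nnc_graph_exec_disjoin(graph, b, d) == 0); // drop b -> d; b inherits d's outgoings
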
      | int ccv_nnc_graph_exec_count(const ccv_nnc_graph_t* const graph)
     0| {
     0|   return graph->exec_info ? graph->exec_info->rnum : 0;
     0| }
      |
      | void* ccv_nnc_graph_buffer(ccv_nnc_graph_t* const graph, int size)
 3.47k| {
 3.47k|   if (graph->buffer_size >= size)
 3.45k|     return graph->buffer;
    15|   graph->buffer_size = size;
    15|   graph->buffer = (graph->buffer) ? ccrealloc(graph->buffer, size) : ccmalloc(size);
    15|   return graph->buffer;
    15| }
      |
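The buffer is a lazily grown scratch region owned by the graph: a request that fits the current capacity returns the existing pointer, while a larger one reallocates (so earlier pointers are invalidated). A usage sketch:

    int* const scratch = (int*)ccv_nnc_graph_buffer(graph, sizeof(int) * 256);
    // ... use scratch; do not free it, the graph owns the allocation ...
    // A larger request may ccrealloc and move the block:
    float* const bigger = (float*)ccv_nnc_graph_buffer(graph, sizeof(float) * 4096);
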
      | void ccv_nnc_graph_topsort(ccv_nnc_graph_t* const graph, int* const exec_cvt, const int exec_cvt_size)
 1.17k| {
 1.17k|   assert(exec_cvt_size == graph->exec_info->rnum);
 1.17k|   assert(graph->sources && graph->sources->rnum);
 1.17k|   assert(graph->destinations && graph->destinations->rnum);
 1.17k|   int i, j;
 8.93k|   for (i = 0; i < exec_cvt_size; i++)
 7.75k|     exec_cvt[i] = -1;
 1.17k|   ccv_array_t* exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), graph->exec_info->rnum, 0);
 1.17k|   // If there are breakpoints, it is more complicated, we first start to the breakpoints, and then continue from the breakpoints to the destinations.
 1.17k|   if (graph->breakpoint_size)
    21|   {
    42|     ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, graph->breakpoints, graph->breakpoint_size, 0);
    42|     for (i = 0; i < graph->breakpoint_size; i++)
    21|       exec_cvt[graph->breakpoints[i].d] = -2; // Mark this as breakpoints, so we will skip the first round.
    42|     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
    32|       assert(!node->peer_ref); // If node has a peer ref, we cannot fix it up.
    32|       if (exec_cvt[idx] == -2) // Skip breakpoint.
    21|         continue;
    11|       // Loop over node and push to the array.
    11|       ccv_array_push(exec_info, node);
    11|       // Go to its sub-graph to fix exec_idx
    11|       for (i = 0; i < node->graph_ref_size; i++)
     0|       {
     0|         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
     0|         if (graph_ref >= 0)
     0|         {
     0|           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
     0|           sub_graph->exec_idx = exec_info->rnum;
     0|         }
     0|       }
    11|       exec_cvt[idx] = exec_info->rnum - 1;
    11|     } ccv_nnc_graph_visit_endfor
    42|     ccv_nnc_graph_visit_free(visit);
    21|     graph->breakpoint_offset = exec_info->rnum;
    42|     visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
    44|     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
    44|       assert(!node->peer_ref); // If node has a peer ref, we cannot fix it up.
    44|       // Loop over node and push to the array.
    44|       ccv_array_push(exec_info, node);
    44|       // Go to its sub-graph to fix exec_idx
    52|       for (i = 0; i < node->graph_ref_size; i++)
     8|       {
     8|         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
     8|         if (graph_ref >= 0)
     8|         {
     8|           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
     8|           sub_graph->exec_idx = exec_info->rnum;
     8|         }
     8|       }
    44|       exec_cvt[idx] = exec_info->rnum - 1;
    44|     } ccv_nnc_graph_visit_endfor
    42|     ccv_nnc_graph_visit_free(visit);
    42|     for (i = 0; i < graph->breakpoint_size; i++)
    21|       { assert(exec_cvt[graph->breakpoints[i].d] >= 0); } // All breakpoints should be assigned.
 1.15k|   } else {
 2.31k|     ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
 7.70k|     ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
 7.70k|       assert(!node->peer_ref); // If node has a peer ref, we cannot fix it up.
 7.70k|       // Loop over node and push to the array.
 7.70k|       ccv_array_push(exec_info, node);
 7.70k|       // Go to its sub-graph to fix exec_idx
 7.74k|       for (i = 0; i < node->graph_ref_size; i++)
    42|       {
    42|         const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
    42|         if (graph_ref >= 0)
    42|         {
    42|           ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
    42|           sub_graph->exec_idx = exec_info->rnum;
    42|         }
    42|       }
 7.70k|       exec_cvt[idx] = exec_info->rnum - 1;
 7.70k|     } ccv_nnc_graph_visit_endfor
 2.31k|     ccv_nnc_graph_visit_free(visit);
 1.15k|   }
 1.17k|   assert(graph->exec_info->rnum == exec_info->rnum);
 1.17k|   ccv_array_free(graph->exec_info);
 1.17k|   graph->exec_info = exec_info;
 2.35k|   for (i = 0; i < graph->sources->rnum; i++)
 1.17k|   {
 1.17k|     ccv_nnc_graph_exec_t* const source = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, i);
 1.17k|     source->d = exec_cvt[source->d];
 1.17k|   }
 2.35k|   for (i = 0; i < graph->destinations->rnum; i++)
 1.17k|   {
 1.17k|     ccv_nnc_graph_exec_t* const destination = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, i);
 1.17k|     destination->d = exec_cvt[destination->d];
 1.17k|   }
 1.17k|   // Update all outgoings to reflect the latest.
 8.93k|   for (i = 0; i < exec_info->rnum; i++)
 7.75k|   {
 7.75k|     ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(exec_info, i);
 7.75k|     if (info->outgoings)
 14.4k|       for (j = 0; j < info->outgoings->rnum; j++)
 7.82k|         *(int*)ccv_array_get(info->outgoings, j) = exec_cvt[*(int*)ccv_array_get(info->outgoings, j)];
 7.75k|   }
 1.17k|   graph->topsorted = 1;
 1.17k| }
      |
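After topsort the exec array is reordered, so any ccv_nnc_graph_exec_t held by the caller is stale; sources and destinations are remapped in place above, and everything else is the caller's job via exec_cvt (old index in, new index out). A sketch using ccmalloc/ccfree from the same library:

    const int n = graph->exec_info->rnum; // must equal exec_cvt_size
    int* const exec_cvt = (int*)ccmalloc(sizeof(int) * n);
    ccv_nnc_graph_topsort(graph, exec_cvt, n);
    my_exec.d = exec_cvt[my_exec.d]; // remap a retained handle (my_exec is illustrative)
    ccfree(exec_cvt);
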
      | typedef struct {
      |   int device_id;
      |   int exec_idx;
      |   ccv_array_t* signal_set;
      |   ccv_array_t* command_set; // The set of commands executed in this stream. In case there is a tie (on rank), we will check this.
      | } ccv_nnc_stream_data_t;
      |
static void _ccv_nnc_graph_schedule_assign_signals(ccv_array_t* const incoming, ccv_nnc_graph_exec_info_t* const node, ccv_array_t* const stream_data, int* const signal_size, ccv_nnc_graph_exec_info_t* const exec_info, const int exec_info_size)
608
1.16k
{
609
1.16k
  assert(incoming->rnum > 0);
610
1.16k
  int i, j, k;
611
1.16k
  int wait_size = 0, max_wait_size = 0;
612
2.81k
  for (i = 0; i < incoming->rnum; 
i++1.65k
)
613
1.65k
  {
614
1.65k
    const int incoming_idx = *(int*)ccv_array_get(incoming, i);
615
1.65k
    ccv_nnc_graph_exec_info_t* const incoming_exec_info = exec_info + incoming_idx;
616
1.65k
    assert(incoming_exec_info->schedule.stream_size > 0);
617
1.65k
    max_wait_size += incoming_exec_info->schedule.stream_size;
618
1.65k
  }
619
1.16k
  int waits[ccv_max(1, max_wait_size)];
620
1.16k
  assert(node->schedule.stream_size > 0);
621
2.81k
  
for (i = 0; 1.16k
i < incoming->rnum;
i++1.65k
)
622
1.65k
  {
623
1.65k
    const int incoming_idx = *(int*)ccv_array_get(incoming, i);
624
1.65k
    assert(incoming_idx < exec_info_size);
625
1.65k
    assert(incoming_idx >= 0);
626
1.65k
    ccv_nnc_graph_exec_info_t* const incoming_exec_info = exec_info + incoming_idx;
627
1.65k
    assert(incoming_exec_info->schedule.stream_size > 0);
628
1.65k
    int stream_synced = 1;
629
1.65k
    // If the current node's stream is a subset of the incoming node's stream, there
630
1.65k
    // is no need to sync with signal, because we are already synced with the incoming.
631
3.30k
    for (j = 0; stream_synced && 
j < node->schedule.stream_size2.65k
;
j++1.65k
)
632
1.65k
    {
633
1.65k
      const int s = SCHEDULE_STREAMS(node->schedule)[j];
634
1.65k
      assert(s >= 0);
635
1.65k
      int flag = 0;
636
3.63k
      for (k = 0; !flag && 
k < incoming_exec_info->schedule.stream_size2.63k
;
k++1.98k
)
637
1.98k
        flag = (SCHEDULE_STREAMS(incoming_exec_info->schedule)[k] == s);
638
1.65k
      stream_synced = flag;
639
1.65k
    }
640
1.65k
    if (stream_synced)
641
1.00k
      continue;
642
645
    // Otherwise, find the streams we need to sync with, and create signals for these.
643
1.30k
    
for (j = 0; 645
j < incoming_exec_info->schedule.stream_size;
j++663
)
644
663
    {
645
663
      const int s = SCHEDULE_STREAMS(incoming_exec_info->schedule)[j];
646
663
      assert(s >= 0);
647
663
      int flag = 0;
648
1.95k
      for (k = 0; !flag && 
k < node->schedule.stream_size1.94k
;
k++1.29k
)
649
1.29k
        flag = (SCHEDULE_STREAMS(node->schedule)[k] == s);
650
663
      if (!flag) // Need to have a signal.
651
653
      {
652
653
        if (SCHEDULE_SIGNALS(incoming_exec_info->schedule)[j] < 0)
653
653
          
SCHEDULE_SIGNALS487
(incoming_exec_info->schedule)[j] = (*signal_size)++487
;
654
166
        else {
655
166
          int flag = 0;
656
166
          // If any of the stream the current node has already seen this signal, we are good already.
657
558
          for (k = 0; !flag && k < node->schedule.stream_size; 
k++392
)
658
392
          {
659
392
            assert(SCHEDULE_STREAMS(node->schedule)[k] >= 0);
660
392
            ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(node->schedule)[k]);
661
392
            flag = (data->signal_set && 
ccv_array_find_int(data->signal_set, 241
SCHEDULE_SIGNALS241
(incoming_exec_info->schedule)[j]));
662
392
          }
663
166
          if (flag)
664
0
            continue;
665
653
        }
666
653
        // Otherwise, we need to wait for this. Currently, our granularity is about wait on all streams.
667
653
        waits[wait_size++] = SCHEDULE_SIGNALS(incoming_exec_info->schedule)[j];
668
653
        // All streams on this node have seen this signal.
669
1.92k
        for (k = 0; k < node->schedule.stream_size; 
k++1.26k
)
670
1.26k
        {
671
1.26k
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(node->schedule)[k]);
672
1.26k
          if (!data->signal_set)
673
310
            data->signal_set = ccv_array_new(sizeof(int), 0, 0);
674
1.26k
          ccv_array_push(data->signal_set, &SCHEDULE_SIGNALS(incoming_exec_info->schedule)[j]);
675
1.26k
        }
676
653
      }
677
663
    }
678
645
  }
679
1.16k
  node->schedule.wait_size = wait_size;
680
1.16k
  if (wait_size > 0)
681
224
  {
682
224
    node->schedule.waits = node->schedule.waits ? 
ccrealloc0
(node->schedule.waits, sizeof(int) * wait_size)0
: ccmalloc(sizeof(int) * wait_size);
683
224
    memcpy(node->schedule.waits, waits, sizeof(int) * wait_size);
684
224
  }
685
1.16k
}
686
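What this pass computes is consumed at run time as signal/wait pairs on streams. In terms of the library's stream API (ccv_nnc_stream_context_emit_signal and ccv_nnc_stream_context_wait_signal from ccv_nnc.h), the scheduled runner behaves roughly like this simplified sketch:

    // Producer node finished its command on stream_p; consumer runs on stream_c.
    ccv_nnc_stream_context_emit_signal(stream_p, signal); // a SCHEDULE_SIGNALS slot
    ccv_nnc_stream_context_wait_signal(stream_c, signal); // a consumer waits[] entry
    // Only after the wait may the consumer's command be queued on stream_c.
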
      | typedef struct {
      |   int rank;
      |   ccv_array_t* outgoings;
      | } ccv_nnc_incoming_t;
      |
      | static int _ccv_nnc_device_ids_for_stream_data(ccv_nnc_graph_exec_info_t* const node, const int device_id, ccv_array_t* const stream_data, int* const device_ids, const int max_device_id_size)
 3.49k| {
 3.49k|   int device_id_size = ccv_nnc_device_ids_for_io(node->inputs, node->input_size, node->outputs, node->output_size, device_ids, max_device_id_size);
 3.49k|   if (device_id_size == 0)
   807|   {
   807|     // If there is a default data, use that device id. Otherwise, use the device id passed in (this will be the default data device id).
   807|     if (stream_data->rnum > 0)
   783|     {
   783|       ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
   783|       device_ids[0] = default_data->device_id;
   783|     } else
    24|       device_ids[0] = device_id >= 0 ? device_id : 0;
   807|     device_id_size = 1;
   807|   }
 3.49k|   return device_id_size;
 3.49k| }
      |
709
static void _ccv_nnc_graph_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type, const int device_id, ccv_nnc_stream_context_t* const stream_context)
710
28
{
711
28
  assert(graph->sources && graph->sources->rnum);
712
28
  assert(graph->destinations && graph->destinations->rnum);
713
28
  assert(graph->topsorted); // Only support this on a topsorted graph.
714
28
  const int exec_info_size = graph->exec_info->rnum;
715
28
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0);
716
56
  ccv_nnc_graph_visit_t* visit = 
ccv_nnc_graph_visit_new28
(graph, exec_info, exec_info_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
717
56
  int i, j, k;
718
56
  // Generate exec dependencies (or, in other words, partial ordering of executions).
719
56
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(exec_info_size, exec_info_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
720
56
  int* buf = (int*)
ccmalloc28
(sizeof(int) * exec_info_size * 2);
721
56
  int buf_size;
722
56
#define for_block(x, val) \
723
77.9k
  do { \
724
77.9k
    if (((int32_t*)val)[0] > 0) \
725
77.9k
    { \
726
77.9k
      buf[buf_size * 2] = x; \
727
77.9k
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
728
77.9k
      ++buf_size; \
729
77.9k
    } \
730
77.9k
  } while (0)
731
1.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx, term) {
732
1.19k
    buf_size = 0; /* save all its parent deps to this buffer */
733
1.19k
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
734
1.19k
    if (node->schedule.stream_size > 1)
735
1.19k
      
ccfree0
(node->schedule._heap_streams)0
;
736
1.19k
    node->schedule.stream_size = 0;
737
1.19k
    node->schedule.wait_size = 0;
738
1.19k
    if (vector)
739
77.9k
      
CCV_SPARSE_VECTOR_FOREACH1.16k
(exec_dep, vector, for_block);
740
1.19k
    if (!node->outgoings)
741
28
      continue;
742
3.39k
    
for (i = 0; 1.16k
i < node->outgoings->rnum;
i++2.22k
)
743
2.22k
    {
744
2.22k
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
745
2.22k
      const int32_t one = 1;
746
2.22k
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
747
2.22k
      /* If not found, set, if the current node is the destination node, no need
748
2.22k
       * set itself as parent of subsequent nodes because its terminal nature. */
749
2.22k
      if (!term && (!cell.i32 || 
cell.i32[0] == 00
))
750
2.22k
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
751
131k
      for (j = 0; j < buf_size; 
j++129k
) /* set with all idx's dependencies as well */
752
129k
      {
753
129k
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
754
129k
        /* If not found, set */
755
129k
        if (!cell.i32 || 
cell.i32[0] == 053.3k
)
756
75.7k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
757
53.3k
        else {
758
53.3k
          /* Otherwise, set to the longest one */
759
53.3k
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
760
53.3k
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
761
53.3k
        }
762
129k
      }
763
2.22k
    }
764
1.16k
  } ccv_nnc_graph_visit_endfor
765
56
#undef for_block
766
56
  
ccfree28
(buf);
767
28
  // Algorithm to allocate signals and streams for this graph.
768
28
  ccv_array_t* const stream_data = ccv_array_new(sizeof(ccv_nnc_stream_data_t), 0, 0);
769
28
  ccv_array_t** const outgoings = cccalloc(exec_info_size, sizeof(ccv_array_t*));
770
28
  ccv_nnc_incoming_t* const incomings = cccalloc(exec_info_size, sizeof(ccv_nnc_incoming_t));
771
28
  int max_device_id_size = 1;
772
28
  // Filter out outgoing nodes that we will be able to access it afterwards anyway.
773
1.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
774
1.19k
    max_device_id_size = ccv_max(node->input_size + node->output_size, max_device_id_size);
775
1.19k
    if (node->outgoings)
776
1.16k
    {
777
1.16k
      outgoings[idx] = ccv_array_new(sizeof(int), 0, 0);
778
3.39k
      for (i = 0; i < node->outgoings->rnum; 
i++2.22k
)
779
2.22k
      {
780
2.22k
        const int di = *(int*)ccv_array_get(node->outgoings, i);
781
2.22k
        int flag = 0;
782
7.14k
        for (j = 0; !flag && 
j < node->outgoings->rnum6.57k
;
j++4.92k
)
783
4.92k
        {
784
4.92k
          if (j != i)
785
3.24k
          {
786
3.24k
            const int dj = *(int*)ccv_array_get(node->outgoings, j);
787
3.24k
            ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, di, dj);
788
3.24k
            flag = (cell.i32 && 
cell.i32[0]576
);
789
3.24k
          }
790
4.92k
        }
791
2.22k
        if (!flag)
792
1.65k
        {
793
1.65k
          ccv_array_push(outgoings[idx], &di);
794
1.65k
          if (!incomings[di].outgoings)
795
1.16k
            incomings[di].outgoings = ccv_array_new(sizeof(int), 1, 0);
796
1.65k
          ccv_array_push(incomings[di].outgoings, &idx);
797
1.65k
        }
798
2.22k
      }
799
1.16k
      // If we have outgoing nodes, I cannot filter out all of them.
800
1.16k
      assert(node->outgoings->rnum == 0 || outgoings[idx]->rnum > 0);
801
1.16k
    }
802
1.19k
  } ccv_nnc_graph_visit_endfor
803
28
#define visitor(node, idx, _) \
804
1.19k
  if (node->outgoings) \
805
2.81k
    
for (i = 0; 1.16k
i < node->outgoings->rnum;
i++1.65k
) \
806
1.65k
    { \
807
1.65k
      const int d = *(int*)ccv_array_get(node->outgoings, i); \
808
1.65k
      node->rank = ccv_max(incomings[d].rank + 1, node->rank); \
809
1.65k
    }
810
1.19k
  
CCV_NNC_GRAPH_VISIT28
(graph, incomings, exec_info_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, 0, 28
visitor);
811
28
#undef visitor
812
28
  int device_ids[max_device_id_size];
813
28
  int outgoing_device_ids[max_device_id_size];
814
28
  int signal_size = 0;
815
1.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
816
1.19k
    // Go through the incomings.
817
1.19k
    const int device_id_size = _ccv_nnc_device_ids_for_stream_data(node, device_id, stream_data, device_ids, max_device_id_size);
818
1.19k
    if (node->schedule.stream_size == 0)
819
28
    {
820
28
      node->schedule.stream_size = device_id_size; // At least at the same size as the device_id_size.
821
28
      if (device_id_size > 1)
822
0
      {
823
0
        node->schedule._heap_streams = (int*)ccmalloc(sizeof(int) * device_id_size * 2);
824
0
        node->schedule._heap_signals = (node->schedule._heap_streams + device_id_size);
825
0
      }
826
56
      for (i = 0; i < device_id_size; 
i++28
)
827
28
        SCHEDULE_STREAMS(node->schedule)[i] = -1, SCHEDULE_SIGNALS(node->schedule)[i] = -1;
828
28
    }
829
2.55k
    for (i = 0; i < device_id_size; 
i++1.36k
)
830
1.36k
      // Go through until the end to assign streams.
831
1.36k
      if (SCHEDULE_STREAMS(node->schedule)[i] < 0)
832
345
      {
833
345
        int stream_idx = -1;
834
345
        int stream_has_command = 0;
835
345
        // First, find a good stream in stream data (the stream is good if it can be recycled, and it has the same command).
836
345
        // Otherwise, we prefer a usable stream (it doesn't have the command, but it can be recycled).
837
11.8k
        for (j = 0; (stream_idx < 0 || 
!stream_has_command96
) && j < stream_data->rnum;
j++11.5k
)
838
11.5k
        {
839
11.5k
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j);
840
11.5k
          if (data->device_id == device_ids[i])
841
3.08k
          {
842
3.08k
            const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, data->exec_idx);
843
3.08k
            // If there is a path to conclude that exec_idx is before idx, then we can reuse
844
3.08k
            // this stream. Otherwise the work in this "empty stream" could still be ongoing,
845
3.08k
            // and we may delay the following work unnecessarily.
846
3.08k
            if (cell.i32 && 
cell.i32[0] > 052
)
847
52
            {
848
52
              if (ccv_array_find_uint(data->command_set, node->cmd.cmd))
849
0
                stream_idx = j, stream_has_command = 1;
850
52
              else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet.
851
25
                stream_idx = j;
852
52
            }
853
3.08k
          }
854
11.5k
        }
855
345
        if (stream_idx < 0)
856
320
        {
857
320
          stream_idx = stream_data->rnum;
858
320
          const ccv_nnc_stream_data_t data = {
859
320
            .device_id = device_ids[i],
860
320
          };
861
320
          ccv_array_push(stream_data, &data);
862
320
        }
863
345
        assert(stream_idx >= 0);
864
345
        ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
865
345
        if (!data->command_set)
866
320
          data->command_set = ccv_array_new(sizeof(uint32_t), 1, 0);
867
345
        SCHEDULE_STREAMS(node->schedule)[i] = stream_idx;
868
345
        ccv_array_add_unique_uint(data->command_set, node->cmd.cmd);
869
345
        // Assign all subsequent node to use this stream.
870
345
        int outgoing_idx = idx;
871
1.36k
        while (outgoings[outgoing_idx] && 
outgoings[outgoing_idx]->rnum1.33k
)
872
1.33k
        {
873
1.33k
          int highest_rank = -1;
874
1.33k
          int highest_idx = -1;
875
1.33k
          int stream_n = -1;
876
1.33k
          int stream_has_command = 0;
877
3.63k
          for (j = 0; j < outgoings[outgoing_idx]->rnum; 
j++2.29k
)
878
2.29k
          {
879
2.29k
            const int d = *(int*)ccv_array_get(outgoings[outgoing_idx], j);
880
2.29k
            ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + d;
881
2.29k
            const int outgoing_device_id_size = _ccv_nnc_device_ids_for_stream_data(outgoing_node, device_id, stream_data, outgoing_device_ids, max_device_id_size);
882
2.29k
            if (outgoing_node->schedule.stream_size == 0)
883
1.16k
            {
884
1.16k
              outgoing_node->schedule.stream_size = outgoing_device_id_size; // At least at the same size as the device_id_size.
885
1.16k
              if (outgoing_device_id_size > 1)
886
64
              {
887
64
                outgoing_node->schedule._heap_streams = (int*)ccmalloc(sizeof(int) * outgoing_device_id_size * 2);
888
64
                outgoing_node->schedule._heap_signals = (outgoing_node->schedule._heap_streams + outgoing_device_id_size);
889
64
              }
890
2.49k
              for (k = 0; k < outgoing_device_id_size; 
k++1.33k
)
891
1.33k
                SCHEDULE_STREAMS(outgoing_node->schedule)[k] = -1, SCHEDULE_SIGNALS(outgoing_node->schedule)[k] = -1;
892
1.16k
            }
893
2.29k
            assert(outgoing_node->schedule.stream_size == outgoing_device_id_size);
894
5.23k
            
for (k = 0; 2.29k
k < outgoing_device_id_size;
k++2.93k
)
895
2.93k
              // If it should be on the same device and the stream is not assign, potentially.
896
2.93k
              if (outgoing_device_ids[k] == device_ids[i] &&
897
2.93k
                
SCHEDULE_STREAMS1.46k
(outgoing_node->schedule)[k] < 01.46k
&&
898
2.93k
                
(1.30k
incomings[d].rank > highest_rank1.30k
||
899
1.30k
                 
(290
incomings[d].rank == highest_rank290
&&
900
290
                  !stream_has_command && 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
)))
901
1.01k
              {
902
1.01k
                highest_rank = incomings[d].rank;
903
1.01k
                highest_idx = d;
904
1.01k
                stream_n = k;
905
1.01k
                // This is 1 if rank is the same (thus, I must break the tie already), if the rank is not the same, we need to compute this.
906
1.01k
                stream_has_command = (incomings[d].rank == highest_rank || 
ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0
);
907
1.01k
              }
908
2.29k
          }
909
1.33k
          if (highest_idx >= 0)
910
1.01k
          {
911
1.01k
            outgoing_idx = highest_idx;
912
1.01k
            ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx;
913
1.01k
            assert(stream_n >= 0);
914
1.01k
            SCHEDULE_STREAMS(outgoing_node->schedule)[stream_n] = stream_idx;
915
1.01k
            ccv_array_add_unique_uint(data->command_set, outgoing_node->cmd.cmd);
916
1.01k
          } else
917
317
            break;
918
1.33k
        }
919
345
        data->exec_idx = outgoing_idx;
920
345
      }
921
1.19k
  } ccv_nnc_graph_visit_endfor
922
28
  // Go through to assign signals when necessary.
923
1.19k
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
924
1.19k
    if (incomings[idx].outgoings && 
incomings[idx].outgoings->rnum1.16k
)
925
1.16k
      _ccv_nnc_graph_schedule_assign_signals(incomings[idx].outgoings, node, stream_data, &signal_size, exec_info, exec_info_size);
926
1.19k
  } ccv_nnc_graph_visit_endfor
927
1.22k
  for (i = 0; i < exec_info_size; 
i++1.19k
)
928
1.19k
    if (outgoings[i])
929
1.16k
      ccv_array_free(outgoings[i]);
930
28
  ccfree(outgoings);
931
1.22k
  for (i = 0; i < exec_info_size; 
i++1.19k
)
932
1.19k
    if (incomings[i].outgoings)
933
1.16k
      ccv_array_free(incomings[i].outgoings);
934
28
  ccfree(incomings);
935
28
  ccv_matrix_free(exec_dep);
936
28
  ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
937
28
  if (device_id >= 0)
938
4
  {
939
4
    // If the default stream (stream 0) is not the same as desired stream, swap with the one that is.
940
4
    if (default_data->device_id != device_id)
941
0
    {
942
0
      int exchange_stream_idx = -1;
943
0
      // Find the stream idx to exchange.
944
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
945
0
        int flag = 0;
946
0
        for(i = 0; !flag && i < node->schedule.stream_size; i++)
947
0
        {
948
0
          const int stream_idx = SCHEDULE_STREAMS(node->schedule)[i];
949
0
          ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx);
950
0
          if (data->device_id == device_id)
951
0
          {
952
0
            exchange_stream_idx = stream_idx;
953
0
            flag = 1;
954
0
          }
955
0
        }
956
0
        if (flag)
957
0
          break;
958
0
      } ccv_nnc_graph_visit_endfor
959
0
      assert(exchange_stream_idx >= 0);
960
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
961
0
        for (i = 0; i < node->schedule.stream_size; i++)
962
0
          if (SCHEDULE_STREAMS(node->schedule)[i] == 0)
963
0
            SCHEDULE_STREAMS(node->schedule)[i] = -1;
964
0
      } ccv_nnc_graph_visit_endfor
965
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
966
0
        for (i = 0; i < node->schedule.stream_size; i++)
967
0
          if (SCHEDULE_STREAMS(node->schedule)[i] == exchange_stream_idx)
968
0
            SCHEDULE_STREAMS(node->schedule)[i] = 0;
969
0
      } ccv_nnc_graph_visit_endfor
970
0
      ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
971
0
        for (i = 0; i < node->schedule.stream_size; i++)
972
0
          if (SCHEDULE_STREAMS(node->schedule)[i] == -1)
973
0
            SCHEDULE_STREAMS(node->schedule)[i] = exchange_stream_idx;
974
0
      } ccv_nnc_graph_visit_endfor
975
0
      ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, exchange_stream_idx))->device_id = default_data->device_id;
976
0
      default_data->device_id = device_id;
977
0
    }
978
4
  }
979
28
  int graph_wait_size = 0;
980
56
  for (i = 0; i < graph->destinations->rnum; 
i++28
)
981
28
  {
982
28
    const int idx = *(int*)ccv_array_get(graph->destinations, i);
983
56
    for (j = 0; j < exec_info[idx].schedule.stream_size; 
j++28
)
984
28
      if (SCHEDULE_STREAMS(exec_info[idx].schedule)[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
985
0
        ++graph_wait_size;
986
28
  }
987
28
  if (graph_wait_size > 0)
988
0
    graph->waits = (graph->waits) ? ccrealloc(graph->waits, sizeof(int) * graph_wait_size) : ccmalloc(sizeof(int) * graph_wait_size);
989
28
  graph_wait_size = 0;
990
56
  for (i = 0; i < graph->destinations->rnum; i++)
  {
    const int idx = *(int*)ccv_array_get(graph->destinations, i);
    ccv_nnc_graph_exec_info_t* const destination_exec_info = exec_info + idx;
    for (j = 0; j < exec_info[idx].schedule.stream_size; j++)
      if (SCHEDULE_STREAMS(destination_exec_info->schedule)[j] != 0) // If this exec_info doesn't end with default stream, we need to wait.
      {
        ccv_nnc_stream_data_t* const default_stream_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
        if (SCHEDULE_SIGNALS(destination_exec_info->schedule)[j] < 0)
          SCHEDULE_SIGNALS(destination_exec_info->schedule)[j] = signal_size++;
        else if (default_stream_data->signal_set && ccv_array_find_int(default_stream_data->signal_set, SCHEDULE_SIGNALS(destination_exec_info->schedule)[j]))
          continue;
        graph->waits[graph_wait_size++] = SCHEDULE_SIGNALS(destination_exec_info->schedule)[j];
      }
  }
  graph->wait_size = graph_wait_size;
  for (i = 0; i < stream_data->rnum; i++)
  {
    ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
    if (data->signal_set)
      ccv_array_free(data->signal_set);
    assert(data->command_set);
    ccv_array_free(data->command_set);
  }
  // Allocate streams & signals
  graph->stream_size = stream_data->rnum;
  graph->streams = (ccv_nnc_stream_context_t**)ccmalloc(sizeof(ccv_nnc_stream_context_t*) * graph->stream_size);
  graph->block_stream_tasks = (ccv_nnc_stream_task_t**)cccalloc(graph->stream_size, sizeof(ccv_nnc_stream_task_t*));
  if (stream_context)
    graph->streams[0] = stream_context;
  for (i = (stream_context ? 1 : 0); i < stream_data->rnum; i++)
  {
    ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i);
    int type = stream_type;
    CCV_TENSOR_SET_DEVICE_ID(type, data->device_id);
    graph->streams[i] = ccv_nnc_stream_context_new(type);
  }
  int default_stream_type = stream_type;
  CCV_TENSOR_SET_DEVICE_ID(default_stream_type, default_data->device_id);
  graph->signal_size = signal_size;
  graph->signals = (ccv_nnc_stream_signal_t**)cccalloc(signal_size, sizeof(ccv_nnc_stream_signal_t*));
  ccv_nnc_graph_visit_for(visit, exec_info, node, idx) {
    for (i = 0; i < node->schedule.stream_size; i++)
      if (SCHEDULE_SIGNALS(node->schedule)[i] >= 0)
      {
        const int signal = SCHEDULE_SIGNALS(node->schedule)[i];
        if (!graph->signals[signal])
        {
          const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(node->schedule)[i]);
          int type = stream_type;
          CCV_TENSOR_SET_DEVICE_ID(type, data->device_id);
          graph->signals[signal] = ccv_nnc_stream_signal_new(type);
        }
      }
  } ccv_nnc_graph_visit_endfor
  ccv_nnc_graph_visit_free(visit);
  for (i = 0; i < signal_size; i++)
    { assert(graph->signals[i]); }
  if (!graph->extern_signal)
    graph->extern_signal = ccv_nnc_stream_signal_new(default_stream_type);
  // Do this recursively for its sub graphs.
  if (graph->sub_graphs)
    for (i = 0; i < graph->sub_graphs->rnum; i++)
    {
      ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i);
      if (sub_graph)
      {
        const int exec_idx = sub_graph->exec_idx - 1;
        assert(exec_info[exec_idx].schedule.stream_size == 1);
        const int stream_idx = SCHEDULE_STREAMS(exec_info[exec_idx].schedule)[0];
        const int device_id = ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx))->device_id;
        _ccv_nnc_graph_static_schedule(sub_graph, stream_type, device_id, graph->streams[stream_idx]);
      }
    }
  ccv_array_free(stream_data);
}
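
At run time, the stream/signal slots filled in above lower to emit/wait pairs on stream contexts. A rough, hedged sketch of that lowering (p, q and type are hypothetical indices here; ccv_nnc_stream_context_emit_signal and ccv_nnc_stream_context_wait_signal are assumed from the public ccv_nnc.h API, alongside the signal constructors used in this file):

  // Sketch only: a cross-stream dependency recorded in SCHEDULE_SIGNALS and a
  // consumer's wait list becomes an emit on the producer stream and a wait on
  // the consumer stream.
  ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_signal_new(type);
  // ... enqueue producer work on graph->streams[p] ...
  ccv_nnc_stream_context_emit_signal(graph->streams[p], signal);
  ccv_nnc_stream_context_wait_signal(graph->streams[q], signal);
  // ... enqueue dependent work on graph->streams[q] ...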

void ccv_nnc_graph_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type)
{
  assert(graph->p == 0);
  _ccv_nnc_graph_static_schedule(graph, stream_type, -1, 0);
}

ccv_nnc_stream_context_t* ccv_nnc_graph_default_stream(const ccv_nnc_graph_t* const graph)
{
  if (graph->streams && graph->stream_size > 0)
    return graph->streams[0];
  return 0;
}
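
Taken together, a hedged usage sketch: schedule once, run against the graph's default stream, then block before reading results (the ccv_nnc_graph_run argument order is assumed from the public header of this vintage; ccv_nnc_stream_context_wait is the public synchronization call):

  ccv_nnc_graph_static_schedule(graph, CCV_STREAM_CONTEXT_GPU);
  ccv_nnc_stream_context_t* const stream = ccv_nnc_graph_default_stream(graph);
  // Passing 0 sources/destinations falls back to the graph's own.
  ccv_nnc_graph_run(graph, 0, 0, 0, 0, 0, 0, stream);
  ccv_nnc_stream_context_wait(stream); // Synchronize before reading outputs.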

static void _ccv_nnc_graph_dot_exec(const int index, const ccv_nnc_graph_exec_info_t* const exec_info, ccv_nnc_stream_context_t** const streams, const int flags, FILE* out)
{
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
    fputc('{', out);
  fprintf(out, "node%d", index);
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
  {
    fputs("|Command: ", out);
    fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
    if (exec_info->schedule.stream_size > 0)
    {
      int i, flag = 0;
      fputs("|Stream: ", out);
      for (i = 0; i < exec_info->schedule.stream_size; i++)
      {
        const int device_id = streams ? CCV_TENSOR_GET_DEVICE_ID(streams[SCHEDULE_STREAMS(exec_info->schedule)[i]]->type) : 0;
        if (i == 0)
          fprintf(out, "%d (d%d)", SCHEDULE_STREAMS(exec_info->schedule)[i], device_id);
        else
          fprintf(out, ", %d (d%d)", SCHEDULE_STREAMS(exec_info->schedule)[i], device_id);
      }
      for (i = 0; i < exec_info->schedule.stream_size; i++)
        if (SCHEDULE_SIGNALS(exec_info->schedule)[i] >= 0)
        {
          if (!flag)
          {
            flag = 1;
            fprintf(out, "|Signal: %d", SCHEDULE_SIGNALS(exec_info->schedule)[i]);
          } else
            fprintf(out, ", %d", SCHEDULE_SIGNALS(exec_info->schedule)[i]);
        }
    }
    if (exec_info->schedule.wait_size > 0)
    {
      fputs("|Wait: ", out);
      int i;
      for (i = 0; i < exec_info->schedule.wait_size - 1; i++)
        fprintf(out, "%d, ", exec_info->schedule.waits[i]);
      fprintf(out, "%d", exec_info->schedule.waits[exec_info->schedule.wait_size - 1]);
    }
    fputc('}', out);
  }
}

static void _ccv_nnc_graph_dot_tensor(const int index, const ccv_nnc_tensor_t* const tensor, const int zone, const int flags, const int depth, FILE* out)
{
  // if it has an alias pointer, or, it is a long form.
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
    fputc('{', out);
  const int is_tensor_view = CCV_IS_TENSOR_VIEW(tensor);
  if (is_tensor_view)
    fprintf(out, "tensorview%d", index);
  else
    fprintf(out, "tensor%d", index);
  int i;
  for (i = 0; i < depth; i++) // Print subscription to denote depth.
    fputc('\'', out);
  if (CCV_GET_TAPE_ALLOC(tensor->type))
    fputs(" (t)", out);
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
  {
    const int device_id = CCV_TENSOR_GET_DEVICE_ID(tensor->info.type);
    fprintf(out, "|d%d|zone%d", device_id, zone);
    for (i = 0; i < depth; i++) // Print subscription to denote depth.
      fputc('\'', out);
    uintptr_t aptr = (uintptr_t)tensor->data.u8;
    const int* ainc = is_tensor_view ? ((ccv_nnc_tensor_view_t*)(tensor))->inc : tensor->info.dim;
    // For the last one, we don't extend to full ainc.
    size_t ainc_size = (ccv_nnc_dimension_count(ainc) - ainc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
    // Print out the range as well.
    fprintf(out, "|{%#010x|%#010x}|%d", (uint32_t)aptr, (uint32_t)(aptr + ainc_size - 1), tensor->info.dim[0]);
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
      fprintf(out, "x%d", tensor->info.dim[i]);
    fputc('}', out);
  }
}

typedef struct {
  int index;
  int name;
  int zone;
  uintptr_t tensor_ref;
  uintptr_t start_ptr;
  uintptr_t end_ptr;
} ccv_nnc_tensor_dot_t;

typedef struct {
  ccv_nnc_tensor_dot_t* dots;
  int* remap;
  int* rename_zone;
  int* rename_index;
} ccv_nnc_tensor_dot_recovery_t;

// First sort by start_ptr, then sort by tensor ptr (so that we will have the same tensor sorted to one cluster).
#define less_than(i1, i2, aux) ((i1).start_ptr < (i2).start_ptr || ((i1).start_ptr == (i2).start_ptr && (i1).tensor_ref < (i2).tensor_ref))
static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_dot_sort_by_ptr, ccv_nnc_tensor_dot_t, less_than)
#undef less_than
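
CCV_IMPLEMENT_QSORT expands into a static sort routine specialized for ccv_nnc_tensor_dot_t with the comparator inlined; it is invoked below as _ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0). For illustration only, the same ordering written against the C standard library would look like:

  #include <stdlib.h>
  // Primary key: start_ptr; secondary key: tensor_ref (clusters views of the
  // same tensor together). Equivalent ordering to the less_than macro above.
  static int tensor_dot_cmp(const void* a, const void* b)
  {
    const ccv_nnc_tensor_dot_t* const d1 = (const ccv_nnc_tensor_dot_t*)a;
    const ccv_nnc_tensor_dot_t* const d2 = (const ccv_nnc_tensor_dot_t*)b;
    if (d1->start_ptr != d2->start_ptr)
      return d1->start_ptr < d2->start_ptr ? -1 : 1;
    if (d1->tensor_ref != d2->tensor_ref)
      return d1->tensor_ref < d2->tensor_ref ? -1 : 1;
    return 0;
  }
  // qsort(tensor_dots, tensor_count, sizeof(ccv_nnc_tensor_dot_t), tensor_dot_cmp);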

static int _ccv_nnc_graph_dot_tensor_multiview_count(const ccv_nnc_tensor_multiview_t* const mv)
{
  if (!CCV_IS_TENSOR_MULTIVIEW(mv))
    return 1;
  const int count = mv->kind + mv->repeat;
  int i, c = 0;
  for (i = 0; i < count; i++)
    c += _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]);
  return c;
}

static void _ccv_nnc_graph_dot_tensor_multiview_tensor_dots(const ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_dot_t* const tensor_dots, int* tensor_index)
{
  const int count = mv->kind + mv->repeat;
  int i;
  for (i = 0; i < count; i++)
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
      _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor_dots, tensor_index);
    else {
      tensor_dots[*tensor_index].name = *tensor_index;
      tensor_dots[*tensor_index].start_ptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
      // Because tv's pointer will get updated, it is not correct in this case to have one tensor_ref.
      tensor_dots[*tensor_index].tensor_ref = tensor_dots[*tensor_index].start_ptr;
      const size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
      tensor_dots[*tensor_index].end_ptr = tensor_dots[*tensor_index].start_ptr + dim_size - 1;
      ++(*tensor_index);
    }
}

static ccv_nnc_tensor_dot_recovery_t _ccv_nnc_graph_tensor_dot_recovery(const ccv_nnc_graph_t* const graph)
{
  int i, j;
  // Recover tensor relationships for all tensors referenced in the graph.
  // Most notably, we have to give these indexes, and find if they point to
  // the same memory region, and whether they overlap. This information was
  // lost when we converted from the symbolic form to the execution form, and
  // here we do our best to recover it because that makes the graph easier to
  // understand when presented visually (also, we don't want to keep this
  // information in the tensor or execution graph to avoid overhead; thus,
  // recovering is the best we can do).
  int tensor_count = 0;
  for (i = 0; i < graph->exec_info->rnum; i++)
  {
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
    for (j = 0; j < exec_info->input_size; j++)
      if (exec_info->inputs[j])
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->inputs[j]) : 1;
    for (j = 0; j < exec_info->output_size; j++)
      if (exec_info->outputs[j])
        tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->outputs[j]) : 1;
  }
  ccv_nnc_tensor_dot_t* tensor_dots = tensor_count > 0 ? (ccv_nnc_tensor_dot_t*)ccmalloc(sizeof(ccv_nnc_tensor_dot_t) * tensor_count) : 0;
  int k = 0;
  for (i = 0; i < graph->exec_info->rnum; i++)
  {
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
    for (j = 0; j < exec_info->input_size; j++)
    {
      ccv_nnc_tensor_t* tensor = exec_info->inputs[j];
      if (!tensor)
        continue;
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
      else {
        tensor_dots[k].name = k;
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
        const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
        const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
        ++k;
      }
    }
    for (j = 0; j < exec_info->output_size; j++)
    {
      ccv_nnc_tensor_t* tensor = exec_info->outputs[j];
      if (!tensor)
        continue;
      if (CCV_IS_TENSOR_MULTIVIEW(tensor))
        _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k);
      else {
        tensor_dots[k].name = k;
        tensor_dots[k].tensor_ref = (uintptr_t)tensor;
        tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8;
        const int* inc = CCV_IS_TENSOR_VIEW(tensor) ? ((ccv_nnc_tensor_view_t*)tensor)->inc : tensor->info.dim;
        const size_t inc_size = (ccv_nnc_dimension_count(inc) - inc[0] + tensor->info.dim[0]) * CCV_GET_DATA_TYPE_SIZE(tensor->type);
        tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + inc_size - 1;
        ++k;
      }
    }
  }
  tensor_count = k; // We may have over-counted; shrink to the actual number.
  // To group overlap memory into one zone, we sort it by start ptr first (secondary by the tensor pointer).
  _ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0);
  int index = 0, zone = 0;
  uintptr_t tensor_ref = tensor_count > 0 ? tensor_dots[0].tensor_ref : 0;
  uintptr_t end_ptr = tensor_count > 0 ? tensor_dots[0].end_ptr : 0;
  // Then, it is trivial, we go by end ptr. If the next start ptr is still within the end ptr (start ptr <= end ptr),
  // they are the same zone.
  for (i = 0; i < tensor_count; i++)
  {
    if (tensor_dots[i].tensor_ref != tensor_ref)
    {
      tensor_ref = tensor_dots[i].tensor_ref;
      ++index;
    }
    if (tensor_dots[i].start_ptr > end_ptr)
    {
      end_ptr = ccv_max(end_ptr, tensor_dots[i].end_ptr);
      ++zone;
    }
    tensor_dots[i].index = index;
    tensor_dots[i].zone = zone;
  }
  // We already have index and zone assigned, but the problem is that these are not very human interpretable (because
  // it follows the pointer from low to high, not the tensor creation order). The following code renames both the index
  // and the zone so that it is much more understandable.
  const int index_count = index + 1;
  const int zone_count = zone + 1;
  int* remap = (int*)ccmalloc(sizeof(int) * (tensor_count + index_count + zone_count));
  int* rename_index = remap + tensor_count;
  int* rename_zone = rename_index + index_count;
  for (i = 0; i < tensor_count; i++)
    remap[tensor_dots[i].name] = i;
  for (i = 0; i < index_count; i++)
    rename_index[i] = -1;
  for (i = 0; i < zone_count; i++)
    rename_zone[i] = -1;
  index = 0;
  zone = 0;
  for (i = 0; i < tensor_count; i++)
  {
    ccv_nnc_tensor_dot_t* tensor_dot = tensor_dots + remap[i];
    if (rename_index[tensor_dot->index] == -1)
      rename_index[tensor_dot->index] = index++;
    if (rename_zone[tensor_dot->zone] == -1)
      rename_zone[tensor_dot->zone] = zone++;
  }
  ccv_nnc_tensor_dot_recovery_t recovery = {
    .dots = tensor_dots,
    .remap = remap,
    .rename_index = rename_index,
    .rename_zone = rename_zone,
  };
  return recovery;
}
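
The zone pass above is a sort-then-sweep over address intervals: after sorting by start_ptr, a new zone opens whenever the next interval begins past the end of the running group. A self-contained sketch of the canonical sweep follows; note the production loop only advances end_ptr when a new zone opens (sufficient when intervals of the same allocation are nested or identical), while the canonical form extends it on every interval:

  #include <stdint.h>
  typedef struct { uintptr_t start, end; int zone; } interval_t;
  // Assumes ivals[] is sorted by start, as _ccv_nnc_tensor_dot_sort_by_ptr does.
  static void assign_zones(interval_t* const ivals, const int count)
  {
    int i, zone = 0;
    uintptr_t end = count > 0 ? ivals[0].end : 0;
    for (i = 0; i < count; i++)
    {
      if (ivals[i].start > end) // Disjoint from the running group: new zone.
        ++zone;
      if (ivals[i].end > end) // Extend the group to the furthest end seen.
        end = ivals[i].end;
      ivals[i].zone = zone;
    }
  }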

static void _ccv_nnc_graph_tensor_dot_recovery_free(const ccv_nnc_tensor_dot_recovery_t recovery)
{
  ccfree(recovery.dots);
  ccfree(recovery.remap);
}

static void _ccv_nnc_graph_dot_tensor_multiview_one(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int depth, int* tensor_index, FILE* out)
{
  const int count = mv->kind + mv->repeat;
  int i, j;
  fputs("|{", out);
  for (i = 0; i < count; i++)
    if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i]))
    {
      fprintf(out, "{%d", i);
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
        fputc('*', out); // Denotes that we loop on this.
      _ccv_nnc_graph_dot_tensor_multiview_one((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], recovery, depth, tensor_index, out);
      if (i == count - 1)
        fputc('}', out);
      else
        fputs("}|", out);
    } else {
      fprintf(out, "{%d", i);
      if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (mv->kind == CCV_NNC_MULTIVIEW_K1N && i > 0))
        fputc('*', out); // Denotes that we loop on this.
      const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
      fprintf(out, "|zone%d", recovery.rename_zone[tensor_dot->zone]);
      for (j = 0; j < depth; j++)
        fputc('\'', out);
      uintptr_t aptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8;
      // For the last one, we don't extend to full ainc.
      size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type);
      // Print out the range as well.
      fprintf(out, "|{%#010x|%#010x}", (uint32_t)aptr, (uint32_t)(aptr + dim_size - 1));
      ++(*tensor_index);
      if (i == count - 1)
        fputc('}', out);
      else
        fputs("}|", out);
    }
  fputc('}', out);
}

static void _ccv_nnc_graph_dot_tensor_multiview(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, int* tensor_index, FILE* out)
{
  // if it has an alias pointer, or, it is a long form.
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
    fputc('{', out);
  const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index];
  fprintf(out, "multiview%d", recovery.rename_index[tensor_dot->index]);
  int i;
  for (i = 0; i < depth; i++) // Print subscription to denote depth.
    fputc('\'', out);
  if (CCV_GET_TAPE_ALLOC(mv->type))
    fputs(" (t)", out);
  if (flags == CCV_NNC_LONG_DOT_GRAPH)
  {
    _ccv_nnc_graph_dot_tensor_multiview_one(mv, recovery, depth, tensor_index, out);
    const ccv_nnc_tensor_t* root = (ccv_nnc_tensor_t*)mv;
    while (CCV_IS_TENSOR_MULTIVIEW(root))
      root = CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)root)[0];
    fprintf(out, "|%d", root->info.dim[0]);
    for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && root->info.dim[i]; i++)
      fprintf(out, "x%d", root->info.dim[i]);
    fputc('}', out);
  } else
    *tensor_index += _ccv_nnc_graph_dot_tensor_multiview_count(mv);
}

static void _ccv_nnc_graph_dot_node(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, ccv_nnc_stream_context_t** const streams, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* const tensor_index)
{
  fprintf(out, "node%d [shape=record,label=\"", exec_index);
  _ccv_nnc_graph_dot_exec(exec_index, exec_info, streams, flags, out);
  int i;
  int k = *tensor_index;
  if (exec_info->input_size > 0)
  {
    fputs("|{Input", out);
    for (i = 0; i < exec_info->input_size; i++)
      if (exec_info->inputs[i])
      {
        fputc('|', out);
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputs("|-", out);
    fputc('}', out);
  }
  if (exec_info->output_size > 0)
  {
    fputs("|{Output", out);
    for (i = 0; i < exec_info->output_size; i++)
      if (exec_info->outputs[i])
      {
        fputc('|', out);
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputs("|-", out);
    fputc('}', out);
  }
  fputs("\"];\n", out);
  *tensor_index = k;
}

static void _ccv_nnc_graph_dot_while_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const ccv_nnc_graph_t* const while_graph, const int flags, const int depth, FILE* out, int* tensor_index)
{
  int i;
  fprintf(out, "label=<<b>while%d </b>Command: ", exec_index);
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
  fputs(">;\n", out);
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
  int k = *tensor_index;
  if (exec_info->input_size > 0)
  {
    fputs("{Input|{", out);
    for (i = 0; i < exec_info->input_size; i++)
    {
      if (i > 0)
        fputc('|', out);
      if (exec_info->inputs[i])
      {
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputc('-', out);
    }
    fputs("}}", out);
  }
  if (exec_info->output_size > 0)
  {
    if (exec_info->input_size > 0)
      fputs("|", out);
    fputs("{Output|{", out);
    for (i = 0; i < exec_info->output_size; i++)
    {
      if (i > 0)
        fputc('|', out);
      if (exec_info->outputs[i])
      {
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputc('-', out);
    }
    fputs("}}", out);
  }
  fputs("}\"];\n", out);
  *tensor_index = k;
}

static void _ccv_nnc_graph_dot_case_of_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* tensor_index)
{
  int i;
  fprintf(out, "label=<<b>caseof%d </b>Command: ", exec_index);
  fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out);
  fputs(">;\n", out);
  fprintf(out, "label%d [shape=record,label=\"{", exec_index);
  int k = *tensor_index;
  if (exec_info->input_size > 0)
  {
    fputs("{Input|{", out);
    for (i = 0; i < exec_info->input_size; i++)
    {
      if (i > 0)
        fputc('|', out);
      if (exec_info->inputs[i])
      {
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputc('-', out);
    }
    fputs("}}", out);
  }
  if (exec_info->output_size > 0)
  {
    if (exec_info->input_size > 0)
      fputs("|", out);
    fputs("{Output|{", out);
    for (i = 0; i < exec_info->output_size; i++)
    {
      if (i > 0)
        fputc('|', out);
      if (exec_info->outputs[i])
      {
        if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i]))
          _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out);
        else {
          const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k];
          _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out);
          ++k;
        }
      } else
        fputc('-', out);
    }
    fputs("}}", out);
  }
  fputs("}\"];\n", out);
  *tensor_index = k;
}

static void _ccv_nnc_graph_dot_sub_graphs(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_tensor_dot_recovery_t p_recovery, const ccv_array_t* const sub_graphs, const int flags, const int depth, FILE* out, int* tensor_index, int* exec_index)
{
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)
  {
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
    const ccv_nnc_graph_t* const while_graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[0] - 1);
    // Output this node info within this subgraph.
    _ccv_nnc_graph_dot_while_label(exec_info, *exec_index, p_recovery, while_graph, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
  } else if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) {
    fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index);
    _ccv_nnc_graph_dot_case_of_label(exec_info, *exec_index, p_recovery, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index);
  }
  ++(*exec_index);
  int p;
  for (p = 0; p < exec_info->graph_ref_size; p++)
  {
    if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
    {
      fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\nlabel=\"\"\n", *exec_index, *exec_index);
      ++(*exec_index);
    }
    const ccv_nnc_graph_t* const graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[p] - 1);
    ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
    int i, j;
    int k = 0;
    int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
    // Output styles.
    for (i = 0; i < graph->exec_info->rnum; i++)
    {
      node_id[i] = *exec_index;
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
      if (CCV_NNC_GRAPH_REF(exec_info)[0])
        _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, depth + 1, out, &k, exec_index);
      else {
        _ccv_nnc_graph_dot_node(exec_info, *exec_index, graph->streams, recovery, flags, depth, out, &k);
        ++(*exec_index);
      }
    }
    // Output connections.
    for (i = 0; i < graph->exec_info->rnum; i++)
    {
      ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
      if (exec_info->outgoings)
        for (j = 0; j < exec_info->outgoings->rnum; j++)
        {
          const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
          const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
          // If both are sub-graphs, have both tail and head specified.
          if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
            fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
          else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
            fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
          else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
            fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
          else
            fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
        }
    }
    fputs("}\n", out);
    _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
    ccfree(node_id);
  }
  // Extra subgraph cluster.
  if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
    fputs("}\n", out);
}

void ccv_nnc_graph_dot(const ccv_nnc_graph_t* const graph, const int flags, FILE* out)
{
  fputs("digraph G {\ncompound=true;\n", out);
  ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph);
  int i, j;
  int k = 0, c = 0;
  int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum);
  // Output styles.
  for (i = 0; i < graph->exec_info->rnum; i++)
  {
    node_id[i] = c;
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
    if (CCV_NNC_GRAPH_REF(exec_info)[0])
      _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, 1, out, &k, &c);
    else {
      _ccv_nnc_graph_dot_node(exec_info, c, graph->streams, recovery, flags, 0, out, &k);
      ++c;
    }
  }
  // Output connections.
  for (i = 0; i < graph->exec_info->rnum; i++)
  {
    ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
    if (exec_info->outgoings)
      for (j = 0; j < exec_info->outgoings->rnum; j++)
      {
        const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j);
        const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx);
        // If both are sub-graphs, have both tail and head specified.
        if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
          fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]);
        else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !CCV_NNC_GRAPH_REF(outgoing_info)[0])
          fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]);
        else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0])
          fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]);
        else
          fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]);
      }
  }
  fputs("}\n", out);
  _ccv_nnc_graph_tensor_dot_recovery_free(recovery);
  ccfree(node_id);
}
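
A hedged usage sketch: dump the concrete graph to Graphviz and render it offline. CCV_NNC_LONG_DOT_GRAPH enables the per-node stream/signal and per-tensor address-range annotations produced by the helpers above:

  FILE* out = fopen("graph.dot", "w+");
  ccv_nnc_graph_dot(graph, CCV_NNC_LONG_DOT_GRAPH, out);
  fclose(out);
  // Then render from a shell: dot -Tpng graph.dot -o graph.png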

void ccv_nnc_graph_autotune(ccv_nnc_graph_t* const graph, const size_t max_workspace_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
  // exec current node, for synchronous CPU execution, no stream unit.
  int i;
#define visitor(node, idx, ...) \
  do { \
    if (node->cmd.cmd == CCV_NNC_NOOP) \
      continue; \
    if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD) \
      for (i = 0; i < node->graph_ref_size; i++) \
      { \
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[i] - 1); \
        ccv_nnc_graph_autotune(sub_graph, max_workspace_size, flags, 0, 0, 0, 0); \
      } \
    else { \
      /* Need to unwrap these tensors (inputs & outputs are one contiguous allocation, so indexing inputs[] past input_size reaches the outputs) */ \
      for (i = 0; i < node->input_size + node->output_size; i++) \
        if (node->inputs[i] && CCV_IS_TENSOR_MULTIVIEW(node->inputs[i])) \
          node->inputs[i] = _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)node->inputs[i]); \
      PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); \
      for (i = 0; i < node->input_size; i++) \
        PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p)\n", i + 1, node->inputs[i], (node->inputs[i] ? node->inputs[i]->data.u8 : 0)); \
      for (i = 0; i < node->output_size; i++) \
        PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p)\n", i + 1, node->outputs[i], (node->outputs[i] ? node->outputs[i]->data.u8 : 0)); \
      node->cmd = ccv_nnc_cmd_autotune(node->cmd, max_workspace_size, node->hint, flags, node->inputs, node->input_size, node->outputs, node->output_size, 0); \
    } \
  } while (0)
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0);
  const int graph_source_size = source_size ? source_size : (graph->sources ? graph->sources->rnum : 0);
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0);
  const int graph_destination_size = destination_size ? destination_size : (graph->destinations ? graph->destinations->rnum : 0);
  CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
#undef visitor
}
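
A hedged usage sketch: let each node pick its fastest backend under a bounded scratch budget before the real run (512 MiB here is an arbitrary example; passing 0 sources/destinations falls back to the graph's own, per the defaulting logic above):

  ccv_nnc_graph_autotune(graph, 512 * 1024 * 1024, 0, 0, 0, 0, 0);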

void ccv_nnc_graph_free(ccv_nnc_graph_t* const graph)
{
  int i, j;
  for (i = 0; i < graph->exec_info->rnum; i++)
  {
    ccv_nnc_graph_exec_info_t *info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i);
    if (info->_heap_graph_ref)
      ccfree(info->_heap_graph_ref);
    ccv_array_t* outgoings = info->outgoings;
    if (outgoings)
      ccv_array_free(outgoings);
    // We allocate inputs & outputs in one contiguous block, therefore we only need to free the input array.
    if (info->inputs)
      ccfree(info->inputs);
    if (info->input_flags)
      ccfree(info->input_flags);
    if (info->updates)
      ccfree(info->updates);
    if ((info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) && info->p_while.inputs)
      ccfree(info->p_while.inputs);
    if (info->schedule.stream_size > 1)
      ccfree(info->schedule._heap_streams);
    if (info->schedule.waits)
      ccfree(info->schedule.waits);
  }
  if (graph->tensor_wraps)
  {
    for (i = 0; i < graph->tensor_wraps->rnum; i++)
    {
      ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, i);
      if (tensor_wrap_array)
      {
        for (j = 0; j < tensor_wrap_array->size; j++)
          _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[j]);
        ccfree(tensor_wrap_array);
      }
    }
    ccv_array_free(graph->tensor_wraps);
  }
  if (graph->tensor_wraps_refs)
    ccv_array_free(graph->tensor_wraps_refs);
  if (graph->breakpoints)
    ccfree(graph->breakpoints);
  if (graph->sources)
    ccv_array_free(graph->sources);
  if (graph->destinations)
    ccv_array_free(graph->destinations);
  if (graph->streams)
  {
    // If the graph has a parent graph, the default stream is allocated by the parent graph, so we skip freeing it here.
    if (!graph->p)
      ccv_nnc_stream_context_free(graph->streams[0]);
    for (i = 1; i < graph->stream_size; i++)
      ccv_nnc_stream_context_free(graph->streams[i]);
    ccfree(graph->streams);
  }
  if (graph->block_stream_tasks)
    ccfree(graph->block_stream_tasks);
  if (graph->signals)
  {
    for (i = 0; i < graph->signal_size; i++)
      ccv_nnc_stream_signal_free(graph->signals[i]);
    ccfree(graph->signals);
  }
  if (graph->extern_signal)
    ccv_nnc_stream_signal_free(graph->extern_signal);
  if (graph->waits)
    ccfree(graph->waits);
  if (graph->carry_overs)
  {
    for (i = 0; i < graph->carry_overs->rnum; i++)
    {
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
      _ccv_nnc_graph_tensor_wrap_free(carry_over->from);
      _ccv_nnc_graph_tensor_wrap_free(carry_over->to);
    }
    ccv_array_free(graph->carry_overs);
  }
  if (graph->sub_graphs)
  {
    for (i = 0; i < graph->sub_graphs->rnum; i++)
      ccv_nnc_graph_free(*(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i));
    ccv_array_free(graph->sub_graphs);
  }
  ccv_array_free(graph->exec_info);
  ccfree(graph);
}
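
For completeness, a hedged lifecycle sketch; ccv_nnc_graph_free above releases everything recursively (sub-graphs, streams, signals, exec metadata), so a single call suffices at the end (the ccv_nnc_graph_run argument order is assumed from the public header):

  ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
  // ... create execs, connect them (e.g. with ccv_nnc_graph_exec_concat),
  // and set the graph's sources / destinations ...
  ccv_nnc_graph_run(graph, 0, 0, 0, 0, 0, 0, 0);
  ccv_nnc_graph_free(graph); // Also frees sub-graphs, streams, and signals.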