Coverage Report

Created: 2021-09-21 23:33

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_graph_run.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_nnc_graph.h"
6
#include "_ccv_nnc_stream.h"
7
#ifdef HAVE_CUDA
8
#include "gpu/ccv_nnc_compat.h"
9
#endif
10
11
// MARK - Level-2 API
12
13
static void _ccv_nnc_unwrap_tensor_wrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
14
930
{
15
930
  ccv_nnc_tensor_t* tensor = tensor_wrap->tensors[tensor_wrap->index];
16
1.96k
  while (CCV_IS_TENSOR_MULTIVIEW(tensor) &&
17
1.96k
      (((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph ||
18
1.06k
       ((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph->pair))
19
1.03k
  {
20
1.03k
    // If the anchor is from the pair, we use the reverse_count instead (we are looking it up).
21
1.03k
    const int i = (int)((((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph) ? count : reverse_count);
22
1.03k
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
23
1.03k
    const int off = mv->kind;
24
1.03k
    const int mod = mv->repeat;
25
1.03k
    tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i >= off ? ((i - off) % mod) + off : i]; // Unwrap.
26
1.03k
    // If reached the root.
27
1.03k
    if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
28
1.03k
      tensor_wrap->update_required = 1; // Need to update tensor updates.
29
1.03k
    ++tensor_wrap->index;
30
1.03k
    tensor_wrap->tensors[tensor_wrap->index] = tensor;
31
1.03k
    assert(tensor_wrap->index < tensor_wrap->count);
32
1.03k
  }
33
930
}
34
35
static void _ccv_nnc_graph_unwrap_sub_graph(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, const ccv_nnc_graph_t* const sub_graph)
36
198
{
37
198
  int i;
38
198
  if (sub_graph->carry_overs)
39
265
    for (i = 0; i < sub_graph->carry_overs->rnum; i++)
40
144
    {
41
144
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
42
144
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->from);
43
144
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->to);
44
144
    }
45
198
  if (sub_graph->sub_graphs)
46
82
    for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
47
61
      _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
48
198
}
49
50
static void _ccv_nnc_graph_unwrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count)
51
171
{
52
171
  if (!graph->tensor_wraps_refs)
53
34
    return;
54
137
  int i, j;
55
510
  for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
56
373
  {
57
373
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
58
373
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
59
373
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
60
373
    if (tensor_wrap_array)
61
1.36k
      for (j = 0; j < tensor_wrap_array->size; j++)
62
994
      {
63
994
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
64
994
        if (!tensor_wrap)
65
352
          continue;
66
642
        _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, tensor_wrap);
67
642
      }
68
373
  }
69
137
  _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, graph);
70
137
}
71
72
static void _ccv_nnc_graph_transit_move_to(const ccv_nnc_graph_t* const graph)
73
141
{
74
141
  int i;
75
141
  if (graph->carry_overs)
76
255
    for (i = 0; i < graph->carry_overs->rnum; i++)
77
137
    {
78
137
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
79
137
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->to->tensors[carry_over->to->index]);
80
137
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
81
137
      it->data = carry_over->transit;
82
137
    }
83
141
}
84
85
static void _ccv_nnc_graph_from_move_transit(const ccv_nnc_graph_t* const graph)
86
143
{
87
143
  int i;
88
143
  if (graph->carry_overs)
89
258
    for (i = 0; i < graph->carry_overs->rnum; i++)
90
139
    {
91
139
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
92
139
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->from->tensors[carry_over->from->index]);
93
139
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
94
139
      carry_over->transit = it->data;
95
139
    }
96
143
}
97
98
static void _ccv_nnc_rewrap_tensor_wrap(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
99
930
{
100
1.96k
  while (tensor_wrap->index > 0 && CCV_IS_TENSOR_MULTIVIEW(tensor_wrap->tensors[tensor_wrap->index - 1]) &&
101
1.96k
      (((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph ||
102
1.18k
       ((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph->pair))
103
1.03k
    --tensor_wrap->index;
104
930
}
105
106
static void _ccv_nnc_graph_rewrap_sub_graph(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_t* const sub_graph)
107
198
{
108
198
  int i;
109
198
  if (sub_graph->carry_overs)
110
265
    for (i = 0; i < sub_graph->carry_overs->rnum; i++)
111
144
    {
112
144
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
113
144
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->from);
114
144
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->to);
115
144
    }
116
198
  if (sub_graph->sub_graphs)
117
82
    for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
118
61
      _ccv_nnc_graph_rewrap_sub_graph(graph, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
119
198
}
120
121
static void _ccv_nnc_graph_rewrap(const ccv_nnc_graph_t* const graph) // Call this method at the end to roll the wrap_ptr back
122
171
{
123
171
  if (!graph->tensor_wraps_refs)
124
34
    return;
125
137
  int i, j;
126
510
  for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
127
373
  {
128
373
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
129
373
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
130
373
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
131
373
    if (tensor_wrap_array)
132
1.36k
      for (j = 0; j < tensor_wrap_array->size; j++)
133
994
      {
134
994
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
135
994
        if (!tensor_wrap)
136
352
          continue;
137
642
        _ccv_nnc_rewrap_tensor_wrap(graph, tensor_wrap);
138
642
      }
139
373
  }
140
137
  _ccv_nnc_graph_rewrap_sub_graph(graph, graph);
141
137
}
142
143
static void _ccv_nnc_graph_exec_unwrap_io(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
144
274k
{
145
274k
  if (!node->tensor_wraps_ref)
146
274k
    return;
147
277
  int i;
148
277
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
149
277
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
150
1.04k
  for (i = 0; i < tensor_wrap_array->size; i++)
151
767
    if (tensor_wraps[i])
152
492
    {
153
492
      assert(tensor_wraps[i]->index > 0);
154
492
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
155
492
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
156
492
      // Only now set the mv->it, because now this node is about to get executed.
157
492
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
158
492
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
159
492
    }
160
699
  for (i = 0; i < node->input_size; i++)
161
422
    if (tensor_wraps[i])
162
191
      node->inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
163
277
  const int d = node->input_size;
164
472
  for (i = 0; i < node->output_size; i++)
165
195
    if (tensor_wraps[d + i])
166
151
      node->outputs[i] = tensor_wraps[d + i]->tensors[tensor_wraps[d + i]->index];
167
277
}
168
169
static void _ccv_nnc_graph_exec_unwrap_while_expr(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
170
161
{
171
161
  assert(node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE);
172
161
  if (!node->p_while.tensor_wraps_ref)
173
155
    return;
174
6
  int i;
175
6
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->p_while.tensor_wraps_ref - 1);
176
6
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
177
18
  for (i = 0; i < tensor_wrap_array->size; i++)
178
12
    if (tensor_wraps[i])
179
6
    {
180
6
      assert(tensor_wraps[i]->index > 0);
181
6
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
182
6
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
183
6
      // Only now set the mv->it, because now this node is about to get executed.
184
6
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
185
6
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
186
6
    }
187
18
  for (i = 0; i < node->p_while.input_size; i++)
188
12
    if (tensor_wraps[i])
189
6
      node->p_while.inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
190
6
}
191
192
static void _ccv_nnc_graph_exec_unwrap_phi(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_info_t* const node, const int ref)
193
41
{
194
41
  int i;
195
41
  // If the output tensor is a phi multi-view tensor, we update our selection to all the subscribers.
196
80
  for (i = 0; i < node->output_size; i++)
197
39
    if (CCV_IS_TENSOR_MULTIVIEW(node->outputs[i]) &&
198
39
      ((ccv_nnc_tensor_multiview_t*)node->outputs[i])->anchor == CCV_NNC_MULTIVIEW_PHI)
199
39
    {
200
29
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)node->outputs[i];
201
29
      mv->it = CCV_NNC_MULTIVIEW_DATA(mv)[ref >= 0];
202
29
      ccv_nnc_tensor_multiview_synchronize(mv);
203
29
    }
204
41
}
205
206
static void _ccv_nnc_graph_exec_begin_synchronize_multiviews(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
207
274k
{
208
274k
  if (!node->tensor_wraps_ref)
209
274k
    return;
210
277
  int i;
211
277
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
212
277
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
213
1.04k
  for (i = 0; i < tensor_wrap_array->size; i++)
214
767
    if (tensor_wraps[i] && tensor_wraps[i]->update_required)
215
492
    {
216
492
      assert(tensor_wraps[i]->index > 0);
217
492
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
218
492
      // Now update the final pointer.
219
492
      ccv_nnc_tensor_multiview_synchronize(mv);
220
492
      tensor_wraps[i]->update_required = 0; // Reset, no need to update.
221
492
    }
222
277
}
223
224
void ccv_nnc_print_tensor_info(const ccv_nnc_tensor_t* const tensor)
225
0
{
226
0
  int i;
227
0
  PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]);
228
0
  for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
229
0
    PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]);
230
0
  PRINT(CCV_CLI_INFO, "]");
231
0
  if (!CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE) || tensor->info.dim[0] <= 0)
232
0
    return;
233
0
  const int nd = ccv_nnc_tensor_nd(tensor->info.dim);
234
0
  const int len = ccv_min(tensor->info.dim[nd - 1], 3);
235
0
  if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_GPU_MEMORY)
236
0
  {
237
0
#ifdef HAVE_CUDA
238
0
    switch (tensor->info.datatype)
239
0
    {
240
0
      case CCV_16F: {
241
0
        uint16_t data[len];
242
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t));
243
0
        float fp32[len];
244
0
        ccv_half_precision_to_float(data, fp32, len);
245
0
        for (i = 0; i < len; i++)
246
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
247
0
        break;
248
0
      }
249
0
      case CCV_32F: {
250
0
        float data[len];
251
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->info.type, len * sizeof(float));
252
0
        for (i = 0; i < len; i++)
253
0
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
254
0
        break;
255
0
      }
256
0
      case CCV_64F: {
257
0
        double data[len];
258
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->info.type, len * sizeof(double));
259
0
        for (i = 0; i < len; i++)
260
0
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
261
0
        break;
262
0
      }
263
0
      case CCV_32S: {
264
0
        int data[len];
265
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->info.type, len * sizeof(int));
266
0
        for (i = 0; i < len; i++)
267
0
          PRINT(CCV_CLI_VERBOSE, " %d", data[i]);
268
0
        break;
269
0
      }
270
0
      case CCV_64S: {
271
0
        int64_t data[len];
272
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->info.type, len * sizeof(int64_t));
273
0
        for (i = 0; i < len; i++)
274
0
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]);
275
0
        break;
276
0
      }
277
0
      case CCV_8U: {
278
0
        uint8_t data[len];
279
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->info.type, len * sizeof(uint8_t));
280
0
        for (i = 0; i < len; i++)
281
0
          PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]);
282
0
        break;
283
0
      }
284
0
    }
285
0
    if (ccv_nnc_tensor_count(tensor->info) > 3)
286
0
      PRINT(CCV_CLI_VERBOSE, " ..");
287
0
#endif
288
0
  } else if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_CPU_MEMORY) {
289
0
    switch (tensor->info.datatype)
290
0
    {
291
0
      case CCV_16F: {
292
0
        float fp32[len];
293
0
        ccv_half_precision_to_float((uint16_t*)tensor->data.f16, fp32, len);
294
0
        for (i = 0; i < len; i++)
295
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
296
0
        break;
297
0
      }
298
0
      case CCV_32F:
299
0
        for (i = 0; i < len; i++)
300
0
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f32[i]);
301
0
        break;
302
0
      case CCV_64F:
303
0
        for (i = 0; i < len; i++)
304
0
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f64[i]);
305
0
        break;
306
0
      case CCV_32S:
307
0
        for (i = 0; i < len; i++)
308
0
          PRINT(CCV_CLI_VERBOSE, " %d", tensor->data.i32[i]);
309
0
        break;
310
0
      case CCV_64S:
311
0
        for (i = 0; i < len; i++)
312
0
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)tensor->data.i64[i]);
313
0
        break;
314
0
      case CCV_8U:
315
0
        for (i = 0; i < len; i++)
316
0
          PRINT(CCV_CLI_VERBOSE, " %d", (int)tensor->data.u8[i]);
317
0
        break;
318
0
    }
319
0
    if (ccv_nnc_tensor_count(tensor->info) > 3)
320
0
      PRINT(CCV_CLI_VERBOSE, " ..");
321
0
  }
322
0
}
323
324
static co_decl(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags));
325
326
6
static co_decl_task(_ccv_nnc_graph_exec_cases_of_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const ccv_nnc_graph_exec_schedule_t* const schd, ccv_nnc_tensor_t* const* const inputs, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, int flags), private(
327
6
  int ref;
328
6
  ccv_nnc_graph_t* sub_graph;
329
6
)) {
330
6
  // Wait until this stream context is done.
331
6
  co_stream_await(CO_P(stream_context));
332
6
  if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
333
2
  {
334
2
    CO_V(ref) = CO_P(exec)->case_of.offset + CO_P(exec)->case_of.expr(CO_P(inputs), CO_P(exec)->input_size, CO_P(exec)->case_of.data);
335
2
    if (CO_P(tensor_tape))
336
2
      ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){
337
0
        .d = CO_P(exec_idx),
338
0
        .graph = CO_P(graph),
339
0
      }, CO_V(ref));
340
2
  } else {
341
0
    assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
342
0
    assert(CO_P(tensor_tape));
343
0
    CO_V(ref) = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){
344
0
        .d = CO_P(exec_idx),
345
0
        .graph = CO_P(graph),
346
0
      });
347
0
  }
348
2
  if (CO_V(ref) >= 0)
349
2
  {
350
2
    assert(CO_V(ref) < CO_P(exec)->graph_ref_size);
351
2
    CO_V(sub_graph) = *(ccv_nnc_graph_t**)ccv_array_get(CO_P(graph)->sub_graphs, CCV_NNC_GRAPH_REF(CO_P(exec))[CO_V(ref)] - 1);
352
2
    assert(CO_P(schd)->stream_size == 1);
353
2
    assert(CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]] == CO_V(sub_graph)->streams[0]);
354
2
    co_apply(_ccv_nnc_graph_topsorted_run_coro, (CO_V(sub_graph), CO_P(exec_idx), CO_V(sub_graph)->default_schedule, CO_P(exec), CO_P(tensor_tape), CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]], CO_P(flags)));
355
2
  }
356
2
  _ccv_nnc_graph_exec_unwrap_phi(CO_P(graph), CO_P(exec), CO_V(ref));
357
2
} co_end()
358
359
typedef struct {
360
  ccv_nnc_graph_t* graph;
361
  const ccv_nnc_graph_exec_schedule_t* node;
362
  ccv_nnc_stream_context_t* stream;
363
} ccv_nnc_graph_neighbor_context_discovery_t;
364
365
static ccv_nnc_stream_context_t* _ccv_nnc_graph_neighbor_context_discovery(const int device_id, void* const context)
366
13.9k
{
367
13.9k
  const ccv_nnc_graph_neighbor_context_discovery_t* const discovery = (ccv_nnc_graph_neighbor_context_discovery_t*)context;
368
13.9k
  if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(discovery->stream)) == device_id)
369
3.65k
    return discovery->stream;
370
10.3k
  ccv_nnc_graph_t* const graph = discovery->graph;
371
10.3k
  const ccv_nnc_graph_exec_schedule_t* const node = discovery->node;
372
10.3k
  int i;
373
10.3k
  // First try to find in other streams of the same node.
374
30.9k
  for (i = 0; i < node->stream_size; i++)
375
30.9k
  {
376
30.9k
    ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*node)[i]];
377
30.9k
    if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream)) == device_id)
378
10.3k
      return stream;
379
30.9k
  }
380
10.3k
  // If cannot find, try to find in all the wait streams.
381
10.3k
  for (i = 0; i < node->wait_size; i++)
382
7
  {
383
7
    ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_signal_get_emitter(graph->signals[node->waits[i]]);
384
7
    if (stream_context && CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream_context)) == device_id)
385
4
      return stream_context;
386
7
  }
387
4
  return 0;
388
4
}
389
390
static co_routine_t* _ccv_nnc_graph_exec_run_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const ccv_nnc_graph_exec_schedule_t* const schd, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags)
391
221k
{
392
221k
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
393
221k
  ccv_nnc_tensor_t** inputs = node->inputs;
394
221k
  ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size : 0;
395
221k
  if (tensor_tape)
396
0
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
397
221k
  /* Broadcast the updates to all subscribed references for input / output, even though at th
398
221k
   * time output is not written yet, propagate pointer change is still valid. */
399
221k
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
400
221k
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
401
4
  {
402
4
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
403
2
    {
404
2
      ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]];
405
2
      return co_new(_ccv_nnc_graph_exec_cases_of_coro, (graph, idx, node, schd, inputs, tensor_tape, node_stream, flags));
406
2
    } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
407
2
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
408
2
      assert(graph->streams[SCHEDULE_STREAMS(*schd)[0]] == sub_graph->streams[0]);
409
2
      return co_new(_ccv_nnc_graph_topsorted_run_coro, (sub_graph, idx, sub_graph->default_schedule, node, tensor_tape, graph->streams[SCHEDULE_STREAMS(*schd)[0]], flags));
410
221k
    }
411
221k
  } else {
412
221k
    PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d] (%d)\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size, SCHEDULE_STREAMS(*schd)[0]);
413
221k
    int i, j;
414
221k
    int flag = 0;
415
454k
    for (i = 0; i < schd->stream_size; i++)
416
232k
    {
417
232k
      ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]];
418
344k
      for (j = 0; j < schd->wait_size; j++)
419
112k
      {
420
112k
        ccv_nnc_stream_context_wait_signal(stream, graph->signals[schd->waits[j]]);
421
112k
        if (!flag)
422
40.8k
        {
423
40.8k
          PRINT(CCV_CLI_INFO, "Wait: (%d, %d)", SCHEDULE_STREAMS(*schd)[i], schd->waits[j]);
424
40.8k
          flag = 1;
425
40.8k
        } else
426
112k
          PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS(*schd)[i], schd->waits[j]);
427
112k
      }
428
232k
    }
429
221k
    if (flag)
430
221k
      PRINT(CCV_CLI_INFO, "\n");
431
853k
    for (i = 0; i < node->input_size; i++)
432
631k
    {
433
631k
      PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
434
631k
      if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
435
631k
        ccv_nnc_print_tensor_info(inputs[i]);
436
631k
      PRINT(CCV_CLI_INFO, "\n");
437
631k
    }
438
221k
    ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]];
439
221k
    ccv_nnc_graph_neighbor_context_discovery_t discovery_context = {
440
221k
      .graph = graph,
441
221k
      .node = schd,
442
221k
      .stream = node_stream
443
221k
    };
444
221k
    ccv_nnc_stream_context_set_neighbor_discovery(node_stream, _ccv_nnc_graph_neighbor_context_discovery, &discovery_context);
445
221k
    ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, node_stream);
446
568k
    for (i = 0; i < node->output_size; i++)
447
346k
    {
448
346k
      PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
449
346k
      if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
450
346k
        ccv_nnc_print_tensor_info(outputs[i]);
451
346k
      PRINT(CCV_CLI_INFO, "\n");
452
346k
    }
453
221k
    flag = 0;
454
454k
    for (i = 0; i < schd->stream_size; i++)
455
232k
      if (SCHEDULE_SIGNALS(*schd)[i] >= 0)
456
55.7k
      {
457
55.7k
        ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]];
458
55.7k
        ccv_nnc_stream_context_emit_signal(stream, graph->signals[SCHEDULE_SIGNALS(*schd)[i]]);
459
55.7k
        if (!flag)
460
55.7k
        {
461
55.7k
          PRINT(CCV_CLI_INFO, "Emit: (%d, %d)", SCHEDULE_STREAMS(*schd)[i], SCHEDULE_SIGNALS(*schd)[i]);
462
55.7k
          flag = 1;
463
55.7k
        } else
464
55.7k
          PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS(*schd)[i], SCHEDULE_SIGNALS(*schd)[i]);
465
55.7k
      }
466
221k
    if (flag)
467
221k
      PRINT(CCV_CLI_INFO, "\n");
468
221k
  }
469
221k
  return 0;
470
221k
}
471
472
static void _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_graph_exec_info_t* const node, co_routine_t* const task)
473
6
{
474
6
  int i, j;
475
6
  if (node->outgoings)
476
8
    for (i = 0; i < node->outgoings->rnum; i++)
477
4
    {
478
4
      const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
479
4
      const ccv_nnc_graph_exec_schedule_t* const outgoing_schd = schd_info + outgoing_idx;
480
4
      // An outgoing stream can be blocked by multiple other tasks from other streams. But it is OK,
481
4
      // because on next round of execution, that one will be marked as blocked again.
482
8
      for (j = 0; j < outgoing_schd->stream_size; j++)
483
4
        graph->block_stream_tasks[SCHEDULE_STREAMS(*outgoing_schd)[j]] = task;
484
4
    }
485
6
}
486
487
6
static co_decl_task(_ccv_nnc_graph_wait_any_sub_tasks, (ccv_nnc_graph_t* const graph, co_routine_t* const* const sub_tasks, const int sub_task_size, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const pending_nodes, const int pending_node_size), private(
488
6
)) {
489
6
  assert(CO_P(sub_task_size) > 0);
490
6
  co_await_any(CO_P(sub_tasks), CO_P(sub_task_size));
491
2
  // This is not good, these local variables need to be in the private section.
492
2
  // I got away with it because there is no yield or resume or apply or any after await above.
493
2
  int i, j, k;
494
4
  for (i = 0; i < CO_P(sub_task_size); i++)
495
2
    if (co_is_done(CO_P(sub_tasks)[i]))
496
2
    {
497
6
      for (j = 0; j < CO_P(pending_node_size); j++)
498
4
      {
499
4
        const ccv_nnc_graph_exec_schedule_t* const node = CO_P(schd_info) + CO_P(pending_nodes)[j];
500
8
        for (k = 0; k < node->stream_size; k++)
501
4
          if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] == CO_P(sub_tasks)[i])
502
4
            CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] = 0;
503
4
      }
504
2
      co_free(CO_P(sub_tasks)[i]);
505
2
    }
506
2
} co_end()
507
508
52.6k
static co_decl_task(_ccv_nnc_graph_exec_run_loop, (ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const psort, const int start_index, const int exec_info_size, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags), private(
509
52.6k
  int i, p, q;
510
52.6k
  int sub_task_size;
511
52.6k
  co_routine_t** sub_tasks;
512
52.6k
  int* pending_nodes[2];
513
52.6k
  int pending_node_size[2];
514
52.6k
  int idx;
515
52.6k
  ccv_nnc_graph_exec_info_t* node;
516
52.6k
  const ccv_nnc_graph_exec_schedule_t* schd;
517
52.6k
  co_routine_t* task;
518
52.6k
)) {
519
52.6k
  CO_V(sub_task_size) = 0;
520
52.6k
  CO_V(sub_tasks) = (co_routine_t**)ccv_nnc_graph_buffer(CO_P(graph), sizeof(co_routine_t*) * (CO_P(graph)->sub_graphs ? CO_P(graph)->sub_graphs->rnum : 0) + sizeof(int) * CO_P(exec_info_size) * 2);
521
52.6k
  CO_V(pending_nodes)[0] = (int*)(CO_V(sub_tasks) + (CO_P(graph)->sub_graphs ? CO_P(graph)->sub_graphs->rnum : 0));
522
52.6k
  CO_V(pending_nodes)[1] = CO_V(pending_nodes)[0] + CO_P(exec_info_size);
523
52.6k
  CO_V(pending_node_size)[0] = 0;
524
52.6k
  CO_V(pending_node_size)[1] = 0;
525
248k
  for (CO_V(i) = CO_P(start_index); CO_V(i) < CO_P(exec_info_size); CO_V(i)++)
526
221k
  {
527
221k
    CO_V(idx) = CO_P(psort) ? CO_P(psort)[CO_V(i)] : CO_V(i);
528
221k
    CO_V(node) = CO_P(exec_info) + CO_V(idx);
529
221k
    CO_V(schd) = CO_P(schd_info) + CO_V(idx);
530
221k
    // If stream is blocked by but not blocked by current executing task.
531
221k
    int blocked = 0, j;
532
454k
    for (j = 0; j < CO_V(schd)->stream_size; j++)
533
232k
      if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]])
534
4
      {
535
4
        CO_V(pending_nodes)[0][CO_V(pending_node_size)[0]++] = CO_V(idx);
536
4
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]);
537
4
        blocked = 1;
538
4
      }
539
221k
    if (blocked)
540
4
      continue;
541
221k
    CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags));
542
221k
    if (CO_V(task))
543
221k
    {
544
4
      co_resume(CO_V(task));
545
4
      if (!co_is_done(CO_V(task)))
546
2
      {
547
2
        CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task);
548
2
        int j;
549
4
        for (j = 0; j < CO_V(schd)->stream_size; j++)
550
2
          CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task);
551
2
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task));
552
2
      } else
553
2
        co_free(CO_V(task));
554
4
    }
555
221k
  }
556
52.6k
  if (CO_V(sub_task_size))
557
26.3k
    co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[0], CO_V(pending_node_size)[0]));
558
26.3k
  CO_V(p) = 0;
559
26.3k
  CO_V(q) = 1;
560
26.3k
  while (CO_V(pending_node_size)[CO_V(p)] > 0)
561
2
  {
562
2
    CO_V(pending_node_size)[CO_V(q)] = 0;
563
2
    CO_V(sub_task_size) = 0;
564
6
    for (CO_V(i) = 0; CO_V(i) < CO_V(pending_node_size)[CO_V(p)]; CO_V(i)++)
565
4
    {
566
4
      CO_V(idx) = CO_V(pending_nodes)[CO_V(p)][CO_V(i)];
567
4
      CO_V(node) = CO_P(exec_info) + CO_V(idx);
568
4
      CO_V(schd) = CO_P(schd_info) + CO_V(idx);
569
4
      int blocked = 0, j;
570
8
      for (j = 0; j < CO_V(schd)->stream_size; j++)
571
4
        if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]])
572
0
        {
573
0
          _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]);
574
0
          CO_V(pending_nodes)[CO_V(q)][CO_V(pending_node_size)[CO_V(q)]++] = CO_V(idx);
575
0
          blocked = 1;
576
0
        }
577
4
      if (blocked)
578
0
        continue;
579
4
      CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags));
580
4
      if (CO_V(task))
581
4
      {
582
0
        co_resume(CO_V(task));
583
0
        if (!co_is_done(CO_V(task)))
584
0
        {
585
0
          CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task);
586
0
          for (j = 0; j < CO_V(schd)->stream_size; j++)
587
0
            CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task);
588
0
          _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task));
589
0
        } else
590
0
          co_free(CO_V(task));
591
0
      }
592
4
    }
593
2
    int t;
594
2
    CCV_SWAP(CO_V(p), CO_V(q), t);
595
2
    if (CO_V(sub_task_size))
596
2
      co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[CO_V(p)], CO_V(pending_node_size)[CO_V(p)]));
597
2
  }
598
26.3k
} co_end()
599
600
78.9k
co_task(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags), private(
601
78.9k
  ccv_nnc_graph_exec_info_t* exec_info;
602
78.9k
  const ccv_nnc_graph_exec_schedule_t* schd_info;
603
78.9k
  co_routine_t* previous_main;
604
78.9k
  int stream_0;
605
78.9k
  // while loop
606
78.9k
  int64_t count, reverse_count;
607
78.9k
  int graph_breakpoint_size;
608
78.9k
  int i, j;
609
78.9k
)) {
610
78.9k
  assert(CO_P(graph)->stream_size > 0);
611
78.9k
  int i;
612
26.3k
  // Assign the resource container pointer.
613
103k
  for (i = 0; i < CO_P(graph)->stream_size; i++)
614
77.4k
    CO_P(graph)->streams[i]->resource_container = CO_P(stream_context)->_inline_container;
615
26.3k
  CO_V(exec_info) = (ccv_nnc_graph_exec_info_t*)ccv_array_get(CO_P(graph)->exec_info, 0);
616
26.3k
  CO_V(schd_info) = CO_P(schedule)->exec_info;
617
26.3k
  CO_V(stream_0) = CO_P(schedule)->stream_0;
618
26.3k
  if (CO_P(exec_idx) == -1)
619
26.3k
  {
620
26.3k
    if (CO_P(stream_context)->main)
621
0
    {
622
0
      CO_V(previous_main) = CO_P(stream_context)->main;
623
0
      CO_P(stream_context)->main = co_self();
624
0
      // Wait the previous task to be done. This makes sure that our graph run is serial on the same stream.
625
0
      assert(!co_is_done(CO_V(previous_main)));
626
0
      co_await(CO_V(previous_main));
627
0
    } else
628
26.3k
      CO_P(stream_context)->main = co_self();
629
26.3k
    PRINT(CCV_CLI_INFO, "Graph Stream %d Begin", CO_V(stream_0));
630
26.3k
    ccv_nnc_stream_signal_t* stream_0_signal;
631
26.3k
    if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)])
632
907
    {
633
907
      // Make sure when we start work on streams[0], the current stream context is done.
634
907
      stream_0_signal = ccv_nnc_stream_context_emit_signal_new(CO_P(stream_context));
635
907
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], stream_0_signal);
636
25.4k
    } else if (CO_P(schedule)->stream_1_size) {
637
81
      ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->begin);
638
81
      stream_0_signal = CO_P(schedule)->begin;
639
81
    }
640
26.3k
    int i, flag = 0;
641
26.5k
    for (i = 0; i < CO_P(schedule)->stream_1_size; i++)
642
250
    {
643
250
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_P(schedule)->stream_1s[i]], stream_0_signal);
644
250
      if (!flag)
645
86
      {
646
86
        PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P(schedule)->stream_1s[i]);
647
86
        flag = 1;
648
86
      } else
649
250
        PRINT(CCV_CLI_INFO, ", %d", CO_P(schedule)->stream_1s[i]);
650
250
    }
651
26.3k
    PRINT(CCV_CLI_INFO, "\n");
652
26.3k
  } else {
653
4
    assert(CO_P(stream_context) == CO_P(graph)->streams[0]);
654
4
  }
655
26.3k
  if (CO_P(exec) && (CO_P(exec)->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
656
2
  {
657
2
    assert(CO_P(schedule) == CO_P(graph)->default_schedule);
658
2
    assert(CO_P(exec)->p_while.expr);
659
2
    CO_V(count) = 0;
660
2
    // This is a forward while loop. Backward while loop will just consult its pairing part.
661
2
    if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
662
2
    {
663
2
      CO_V(graph_breakpoint_size) = CO_P(graph)->breakpoint_offset + CO_P(graph)->breakpoint_size;
664
10
      for (;; ++CO_V(count))
665
12
      {
666
12
        CO_P(graph)->while_count = CO_V(count);
667
12
        if (CO_P(tensor_tape))
668
12
          ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){
669
0
            .d = CO_P(exec_idx),
670
0
            .graph = CO_P(graph)->p,
671
0
          }, CO_V(count));
672
12
        _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), 0);
673
12
        if (CO_V(count) > 0)
674
10
          _ccv_nnc_graph_transit_move_to(CO_P(graph));
675
12
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_V(graph_breakpoint_size), CO_P(tensor_tape), CO_P(flags)));
676
12
        // Reached breakpoints, now check the breakpoint, if not met, break out.
677
12
        // Wait until everything on the stream is executed.
678
24
        for (CO_V(i) = CO_P(graph)->breakpoint_offset; CO_V(i) < CO_V(graph_breakpoint_size); CO_V(i)++)
679
24
          for (CO_V(j) = 0; CO_V(j) < CO_V(schd_info)[CO_V(i)].stream_size; CO_V(j)++)
680
12
            co_stream_await(CO_P(graph)->streams[SCHEDULE_STREAMS(CO_V(schd_info)[CO_V(i)])[CO_V(j)]]);
681
12
        _ccv_nnc_graph_exec_unwrap_while_expr(CO_P(graph), CO_P(exec));
682
12
        if (!CO_P(exec)->p_while.expr(CO_P(exec)->p_while.inputs, CO_P(exec)->p_while.input_size, CO_P(exec)->p_while.data))
683
2
        {
684
2
          _ccv_nnc_graph_rewrap(CO_P(graph));
685
2
          // If we break from here, it is ok because all the streams are waited.
686
2
          break;
687
2
        }
688
10
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_V(graph_breakpoint_size), CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
689
10
        _ccv_nnc_graph_from_move_transit(CO_P(graph));
690
10
        _ccv_nnc_graph_rewrap(CO_P(graph));
691
10
      }
692
2
    } else {
693
0
      // For backward graph, no need to evaluate the while expr.
694
0
      assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
695
0
      assert(CO_P(graph)->pair);
696
0
      assert(CO_P(tensor_tape));
697
0
      CO_V(count) = 0;
698
0
      CO_V(reverse_count) = CO_P(graph)->while_count = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){
699
0
          .d = CO_P(exec_idx),
700
0
          .graph = CO_P(graph)->p,
701
0
        });
702
0
      _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count));
703
0
      co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_P(graph)->breakpoint_offset, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
704
0
      _ccv_nnc_graph_from_move_transit(CO_P(graph));
705
0
      _ccv_nnc_graph_rewrap(CO_P(graph));
706
0
      for (CO_V(count) = 1; CO_V(reverse_count) > 0; ++CO_V(count))
707
0
      {
708
0
        CO_P(graph)->while_count = --CO_V(reverse_count);
709
0
        _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count));
710
0
        _ccv_nnc_graph_transit_move_to(CO_P(graph));
711
0
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
712
0
        _ccv_nnc_graph_from_move_transit(CO_P(graph));
713
0
        _ccv_nnc_graph_rewrap(CO_P(graph));
714
0
      }
715
0
    }
716
2
    assert(CO_V(stream_0) == 0);
717
2
    int i;
718
2
    for (i = 0; i < CO_P(schedule)->wait_size; i++)
719
0
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[0], CO_P(graph)->signals[CO_P(schedule)->waits[i]]);
720
26.3k
  } else {
721
26.3k
    CO_P(graph)->while_count = 0;
722
26.3k
    co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), CO_P(schedule)->psort, 0, CO_P(schedule)->psort ? CO_P(schedule)->psort_size : CO_P(schedule)->exec_info_size, CO_P(tensor_tape), CO_P(flags)));
723
26.3k
    PRINT(CCV_CLI_INFO, "Graph Stream %d End", CO_V(stream_0));
724
26.3k
    int i, flag = 0;
725
26.5k
    for (i = 0; i < CO_P(schedule)->wait_size; i++)
726
191
    {
727
191
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(graph)->signals[CO_P(schedule)->waits[i]]);
728
191
      if (!flag)
729
65
      {
730
65
        PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P(schedule)->waits[i]);
731
65
        flag = 1;
732
65
      } else
733
191
        PRINT(CCV_CLI_INFO, ", %d", CO_P(schedule)->waits[i]);
734
191
    }
735
26.3k
    PRINT(CCV_CLI_INFO, "\n");
736
26.3k
  }
737
26.3k
  if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)])
738
907
  {
739
907
    assert(CO_P(exec_idx) == -1);
740
907
    ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->end);
741
907
    ccv_nnc_stream_context_wait_signal(CO_P(stream_context), CO_P(schedule)->end);
742
907
  }
743
26.3k
  // Reset main to 0 if it is current me.
744
26.3k
  if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self())
745
26.3k
    CO_P(stream_context)->main = 0;
746
26.3k
} co_end()
747
748
static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context);
749
750
static inline void _ccv_nnc_graph_exec_run(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)
751
52.7k
{
752
52.7k
  int i;
753
52.7k
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
754
52.7k
  ccv_nnc_tensor_t** inputs = node->inputs;
755
52.7k
  ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size : 0;
756
52.7k
  if (tensor_tape)
757
78
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
758
52.7k
  /* Broadcast the updates to all subscribed references for input / output, even though at th
759
52.7k
   * time output is not written yet, propagate pointer change is still valid. */
760
52.7k
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
761
52.7k
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
762
67
  {
763
67
    assert(!stream_context); // This doesn't work properly with stream context.
764
67
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
765
39
    {
766
39
      int ref;
767
39
      if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
768
35
      {
769
35
        ref = node->case_of.offset + node->case_of.expr(inputs, node->input_size, node->case_of.data);
770
35
        if (tensor_tape)
771
4
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
772
4
            .d = idx,
773
4
            .graph = graph,
774
4
          }, ref);
775
35
      } else {
776
4
        assert(node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
777
4
        assert(tensor_tape);
778
4
        ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
779
4
            .d = idx,
780
4
            .graph = graph,
781
4
          });
782
4
      }
783
39
      if (ref >= 0)
784
31
      {
785
31
        assert(ref < node->graph_ref_size);
786
31
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[ref] - 1);
787
31
        _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context);
788
31
      }
789
39
      _ccv_nnc_graph_exec_unwrap_phi(graph, node, ref);
790
39
    } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
791
28
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
792
28
      _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context);
793
28
    }
794
52.6k
  } else {
795
52.6k
    PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size);
796
202k
    for (i = 0; i < node->input_size; i++)
797
149k
    {
798
149k
      PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
799
149k
      if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
800
149k
        ccv_nnc_print_tensor_info(inputs[i]);
801
149k
      PRINT(CCV_CLI_INFO, "\n");
802
149k
    }
803
52.6k
    ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, stream_context);
804
134k
    for (i = 0; i < node->output_size; i++)
805
81.6k
    {
806
81.6k
      PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
807
81.6k
      if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
808
81.6k
        ccv_nnc_print_tensor_info(outputs[i]);
809
81.6k
      PRINT(CCV_CLI_INFO, "\n");
810
81.6k
    }
811
52.6k
  }
812
52.7k
}
813
814
static inline void _ccv_nnc_graph_topsorted_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const int flags, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
815
8.80k
{
816
8.80k
  int i;
817
8.80k
  if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
818
23
  {
819
23
    assert(!stream_context); // This doesn't work properly with stream context.
820
23
    assert(exec->p_while.expr);
821
23
    int64_t count = 0;
822
23
    // This is a forward while loop. Backward while loop will just consult its pairing part.
823
23
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
824
22
    {
825
22
      const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size;
826
104
      for (;; ++count)
827
126
      {
828
126
        graph->while_count = count;
829
126
        if (tensor_tape)
830
5
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
831
5
            .d = exec_idx,
832
5
            .graph = graph->p,
833
5
          }, count);
834
126
        _ccv_nnc_graph_unwrap(graph, count, 0);
835
126
        if (count > 0)
836
104
          _ccv_nnc_graph_transit_move_to(graph);
837
312
        for (i = 0; i < graph_breakpoint_size; i++)
838
186
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
839
126
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
840
126
        // Reached breakpoints, now check the breakpoint, if not met, break out.
841
126
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
842
22
        {
843
22
          _ccv_nnc_graph_rewrap(graph);
844
22
          break;
845
22
        }
846
210
        for (i = graph_breakpoint_size; i < graph->exec_info->rnum; i++)
847
106
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
848
104
        _ccv_nnc_graph_from_move_transit(graph);
849
104
        _ccv_nnc_graph_rewrap(graph);
850
104
      }
851
22
    } else {
852
1
      // For backward graph, no need to evaluate the while expr.
853
1
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
854
1
      assert(graph->pair);
855
1
      assert(tensor_tape);
856
1
      count = 0;
857
1
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
858
1
          .d = exec_idx,
859
1
          .graph = graph->p,
860
1
        });
861
1
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
862
5
      for (i = graph->breakpoint_offset; i < graph->exec_info->rnum; i++)
863
4
        _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
864
1
      _ccv_nnc_graph_from_move_transit(graph);
865
1
      _ccv_nnc_graph_rewrap(graph);
866
5
      for (count = 1; reverse_count > 0; ++count)
867
4
      {
868
4
        graph->while_count = --reverse_count;
869
4
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
870
4
        _ccv_nnc_graph_transit_move_to(graph);
871
20
        for (i = 0; i < graph->exec_info->rnum; i++)
872
16
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
873
4
        _ccv_nnc_graph_from_move_transit(graph);
874
4
        _ccv_nnc_graph_rewrap(graph);
875
4
      }
876
1
    }
877
8.77k
  } else {
878
8.77k
    graph->while_count = 0;
879
60.9k
    for (i = 0; i < graph->exec_info->rnum; i++)
880
52.2k
      _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
881
8.77k
  }
882
8.80k
}
883
884
static inline void _ccv_nnc_graph_run_slow_path(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
885
35
{
886
35
  int i, j;
887
35
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
888
35
  const int graph_source_size = source_size ? source_size : graph->sources->rnum;
889
35
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
890
35
  const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
891
35
#define visitor(node, idx, ...) \
892
235
  _ccv_nnc_graph_exec_run(graph, node, idx, tensor_tape, stream_context, flags)
893
35
  if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
894
5
  {
895
5
    assert(!stream_context); // This doesn't work properly with stream context.
896
5
    assert(exec->p_while.expr);
897
5
    int64_t count = 0;
898
5
    // This is a forward while loop. Backward while loop will just consult its pairing part.
899
5
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
900
4
    {
901
4
      ccv_array_t* follows = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), graph->breakpoint_size, 0);
902
8
      for (i = 0; i < graph->breakpoint_size; i++)
903
4
      {
904
4
        const ccv_nnc_graph_exec_info_t* const exec_info = (const ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, graph->breakpoints->d);
905
4
        if (exec_info->outgoings)
906
6
          for (j = 0; j < exec_info->outgoings->rnum; j++)
907
3
          {
908
3
            const ccv_nnc_graph_exec_t exec = {
909
3
              .d = *(int*)ccv_array_get(exec_info->outgoings, j),
910
3
              .graph = graph,
911
3
            };
912
3
            ccv_array_push(follows, &exec);
913
3
          }
914
4
      }
915
19
      for (;; ++count)
916
23
      {
917
23
        graph->while_count = count;
918
23
        if (tensor_tape)
919
5
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
920
5
            .d = exec_idx,
921
5
            .graph = graph->p,
922
5
          }, count);
923
23
        _ccv_nnc_graph_unwrap(graph, count, 0);
924
23
        if (count > 0)
925
19
          _ccv_nnc_graph_transit_move_to(graph);
926
28
        CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph->breakpoints, graph->breakpoint_size, 0, visitor);
927
23
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
928
23
        // Reached breakpoints, now check the breakpoint, if not met, break out.
929
23
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
930
4
        {
931
4
          _ccv_nnc_graph_rewrap(graph);
932
4
          break;
933
4
        }
934
19
        if (follows->rnum > 0)
935
19
          CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(follows, 0), follows->rnum, graph_destinations, graph_destination_size, 0, visitor);
936
19
        _ccv_nnc_graph_from_move_transit(graph);
937
19
        _ccv_nnc_graph_rewrap(graph);
938
19
      }
939
4
      ccv_array_free(follows);
940
4
    } else {
941
1
      // For backward graph, no need to evaluate the while expr.
942
1
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
943
1
      assert(graph->pair);
944
1
      assert(tensor_tape);
945
1
      count = 0;
946
1
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
947
1
          .d = exec_idx,
948
1
          .graph = graph->p,
949
1
        });
950
1
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
951
2
      CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, graph_destinations, graph_destination_size, 1, visitor);
952
1
      _ccv_nnc_graph_from_move_transit(graph);
953
1
      _ccv_nnc_graph_rewrap(graph);
954
5
      for (count = 1; reverse_count > 0; ++count)
955
4
      {
956
4
        graph->while_count = --reverse_count;
957
4
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
958
4
        _ccv_nnc_graph_transit_move_to(graph);
959
8
        CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
960
4
        _ccv_nnc_graph_from_move_transit(graph);
961
4
        _ccv_nnc_graph_rewrap(graph);
962
4
      }
963
1
    }
964
30
  } else {
965
30
    graph->while_count = 0;
966
182
    CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
967
30
  }
968
35
#undef visitor
969
35
}
970
971
static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
972
8.83k
{
973
8.83k
  assert((sources == 0 && source_size == 0) || (sources && source_size));
974
8.83k
  assert((destinations == 0 && destination_size == 0) || (destinations && destination_size));
975
8.83k
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
976
8.83k
  const int graph_source_size = source_size ? source_size : graph->sources->rnum;
977
8.83k
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
978
8.83k
  const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
979
8.83k
  int i;
980
17.6k
  for (i = 0; i < graph_source_size; i++)
981
8.83k
    if (graph_sources[i].graph != graph)
982
0
      return CCV_NNC_EXEC_INVALID;
983
17.6k
  for (i = 0; i < graph_destination_size; i++)
984
8.83k
    if (graph_destinations[i].graph != graph)
985
0
      return CCV_NNC_EXEC_INVALID;
986
8.83k
  // When topsorted is true, there is no memory allocation when run the graph.
987
8.83k
  const int topsorted = (!sources && !destinations && graph->topsorted);
988
8.83k
  if (topsorted)
989
8.80k
    _ccv_nnc_graph_topsorted_run(graph, exec_idx, exec, flags, tensor_tape, stream_context);
990
35
  else
991
35
    _ccv_nnc_graph_run_slow_path(graph, exec_idx, exec, inputs, input_size, outputs, output_size, flags, sources, source_size, destinations, destination_size, tensor_tape, stream_context);
992
8.83k
  return CCV_NNC_EXEC_SUCCESS;
993
8.83k
}
994
995
int ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
996
8.99k
{
997
8.99k
  if (stream_context && graph->topsorted && graph->stream_size > 0 && graph->default_schedule && source_size == 0 && destination_size == 0)
998
215
  {
999
215
    co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context);
1000
215
    co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, graph->default_schedule, 0, tensor_tape, stream_context, flags));
1001
215
    co_schedule(scheduler, task);
1002
215
    // I don't need to worry about freeing this task, it will free itself at the end.
1003
215
    return CCV_NNC_EXEC_SUCCESS;
1004
215
  } else
1005
8.77k
    return _ccv_nnc_graph_run(graph, -1, 0, 0, 0, 0, 0, flags, sources, source_size, destinations, destination_size, tensor_tape, 0 /* In this case, we don't support stream context yet. */);
1006
8.99k
}
1007
1008
int ccv_nnc_graph_run_with_schedule(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_static_schedule_t* const _schedule, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const _stream_context)
1009
26.1k
{
1010
26.1k
  assert(graph->topsorted);
1011
26.1k
  assert(graph->stream_size > 0);
1012
26.1k
  const ccv_nnc_graph_static_schedule_t* const schedule = _schedule ? _schedule : graph->default_schedule;
1013
26.1k
  assert(schedule);
1014
26.1k
  ccv_nnc_stream_context_t* const stream_context = _stream_context ? _stream_context : graph->streams[schedule->stream_0];
1015
26.1k
  co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context);
1016
26.1k
  co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, schedule, 0, tensor_tape, stream_context, flags));
1017
26.1k
  co_schedule(scheduler, task);
1018
26.1k
  // I don't need to worry about freeing this task, it will free itself at the end.
1019
26.1k
  if (!_stream_context) // If no stream context provided, this is a sync operation.
1020
25.4k
    ccv_nnc_stream_context_wait(stream_context);
1021
26.1k
  return CCV_NNC_EXEC_SUCCESS;
1022
26.1k
}
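For reference, a minimal usage sketch of the entry point covered above (not part of the coverage listing); it assumes a ccv_nnc_graph_t* that has already been compiled and topsorted, which is the state the fast path in _ccv_nnc_graph_run expects. Passing null sources, destinations, tensor tape, and stream context matches the signature shown at line 995 and runs the whole graph synchronously:

#include "ccv_nnc.h"

/* Hypothetical helper for illustration only. */
static void run_whole_graph(ccv_nnc_graph_t* const graph)
{
	// Null sources / destinations mean "use the graph's own sources and destinations";
	// with no stream context the call falls through to _ccv_nnc_graph_run and blocks
	// until execution finishes, returning CCV_NNC_EXEC_SUCCESS on success.
	ccv_nnc_graph_run(graph, 0 /* flags */, 0, 0, 0, 0, 0 /* tensor_tape */, 0 /* stream_context */);
}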