Coverage Report

Created: 2026-04-14 20:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_graph_run.c
Line
Count
Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_nnc_graph.h"
6
#include "_ccv_nnc_stream.h"
7
#ifdef HAVE_CUDA
8
#include "gpu/ccv_nnc_compat.h"
9
#elif defined(HAVE_MPS)
10
#include "mps/ccv_nnc_mps.h"
11
#endif
12
13
// MARK - Level-2 API
14
15
static void _ccv_nnc_unwrap_tensor_wrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
16
930
{
17
930
  ccv_nnc_tensor_t* tensor = tensor_wrap->tensors[tensor_wrap->index];
18
1.96k
  while (CCV_IS_TENSOR_MULTIVIEW(tensor) &&
19
1.96k
       
(1.06k
((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph1.06k
||
20
1.06k
      
((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph->pair45
))
21
1.03k
  {
22
    // If the anchor is from the pair, we use the reverse_count instead (we are looking it up).
23
1.03k
    const int i = (int)((((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph) ? 
count1.02k
:
reverse_count15
);
24
1.03k
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
25
1.03k
    const int off = mv->kind;
26
1.03k
    const int mod = mv->repeat;
27
1.03k
    tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i >= off ? 
((i - off) % mod) + off1.00k
:
i32
]; // Unwrap.
28
    // If reached the root.
29
1.03k
    if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
30
889
      tensor_wrap->update_required = 1; // Need to update tensor updates.
31
1.03k
    ++tensor_wrap->index;
32
1.03k
    tensor_wrap->tensors[tensor_wrap->index] = tensor;
33
1.03k
    assert(tensor_wrap->index < tensor_wrap->count);
34
1.03k
  }
35
930
}
36
37
static void _ccv_nnc_graph_unwrap_sub_graph(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, const ccv_nnc_graph_t* const sub_graph)
38
198
{
39
198
  int i;
40
198
  if (sub_graph->carry_overs)
41
265
    
for (i = 0; 121
i < sub_graph->carry_overs->rnum;
i++144
)
42
144
    {
43
144
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
44
144
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->from);
45
144
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->to);
46
144
    }
47
198
  if (sub_graph->sub_graphs)
48
82
    
for (i = 0; 21
i < sub_graph->sub_graphs->rnum;
i++61
)
49
61
      _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
50
198
}
51
52
static void _ccv_nnc_graph_unwrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count)
53
171
{
54
171
  if (!graph->tensor_wraps_refs)
55
34
    return;
56
137
  int i, j;
57
510
  for (i = 0; i < graph->tensor_wraps_refs->rnum; 
i++373
)
58
373
  {
59
373
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
60
373
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
61
373
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
62
373
    if (tensor_wrap_array)
63
1.36k
      
for (j = 0; 373
j < tensor_wrap_array->size;
j++994
)
64
994
      {
65
994
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
66
994
        if (!tensor_wrap)
67
352
          continue;
68
642
        _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, tensor_wrap);
69
642
      }
70
373
  }
71
137
  _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, graph);
72
137
}
73
74
static void _ccv_nnc_graph_transit_move_to(const ccv_nnc_graph_t* const graph)
75
141
{
76
141
  int i;
77
141
  if (graph->carry_overs)
78
255
    
for (i = 0; 118
i < graph->carry_overs->rnum;
i++137
)
79
137
    {
80
137
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
81
137
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->to->tensors[carry_over->to->index]);
82
137
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
83
137
      it->data = carry_over->transit;
84
137
    }
85
141
}
86
87
static void _ccv_nnc_graph_from_move_transit(const ccv_nnc_graph_t* const graph)
88
143
{
89
143
  int i;
90
143
  if (graph->carry_overs)
91
258
    
for (i = 0; 119
i < graph->carry_overs->rnum;
i++139
)
92
139
    {
93
139
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
94
139
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->from->tensors[carry_over->from->index]);
95
139
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
96
139
      carry_over->transit = it->data;
97
139
    }
98
143
}
99
100
static void _ccv_nnc_rewrap_tensor_wrap(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
101
930
{
102
1.96k
  while (tensor_wrap->index > 0 && 
CCV_IS_TENSOR_MULTIVIEW1.18k
(tensor_wrap->tensors[tensor_wrap->index - 1]) &&
103
1.96k
      
(1.18k
((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph1.18k
||
104
1.18k
       
((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph->pair165
))
105
1.03k
    --tensor_wrap->index;
106
930
}
107
108
static void _ccv_nnc_graph_rewrap_sub_graph(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_t* const sub_graph)
109
198
{
110
198
  int i;
111
198
  if (sub_graph->carry_overs)
112
265
    
for (i = 0; 121
i < sub_graph->carry_overs->rnum;
i++144
)
113
144
    {
114
144
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
115
144
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->from);
116
144
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->to);
117
144
    }
118
198
  if (sub_graph->sub_graphs)
119
82
    
for (i = 0; 21
i < sub_graph->sub_graphs->rnum;
i++61
)
120
61
      _ccv_nnc_graph_rewrap_sub_graph(graph, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
121
198
}
122
123
static void _ccv_nnc_graph_rewrap(const ccv_nnc_graph_t* const graph) // Call this method at the end to roll the wrap_ptr back
124
171
{
125
171
  if (!graph->tensor_wraps_refs)
126
34
    return;
127
137
  int i, j;
128
510
  for (i = 0; i < graph->tensor_wraps_refs->rnum; 
i++373
)
129
373
  {
130
373
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
131
373
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
132
373
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
133
373
    if (tensor_wrap_array)
134
1.36k
      
for (j = 0; 373
j < tensor_wrap_array->size;
j++994
)
135
994
      {
136
994
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
137
994
        if (!tensor_wrap)
138
352
          continue;
139
642
        _ccv_nnc_rewrap_tensor_wrap(graph, tensor_wrap);
140
642
      }
141
373
  }
142
137
  _ccv_nnc_graph_rewrap_sub_graph(graph, graph);
143
137
}
144
145
static void _ccv_nnc_graph_exec_unwrap_io(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
146
291k
{
147
291k
  if (!node->tensor_wraps_ref)
148
290k
    return;
149
277
  int i;
150
277
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
151
277
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
152
1.04k
  for (i = 0; i < tensor_wrap_array->size; 
i++767
)
153
767
    if (tensor_wraps[i])
154
492
    {
155
492
      assert(tensor_wraps[i]->index > 0);
156
492
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
157
492
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
158
      // Only now set the mv->it, because now this node is about to get executed.
159
492
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
160
492
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
161
492
    }
162
699
  
for (i = 0; 277
i < node->input_size;
i++422
)
163
422
    if (tensor_wraps[i])
164
191
      node->inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
165
277
  const int d = node->input_size;
166
472
  for (i = 0; i < node->output_size; 
i++195
)
167
195
    if (tensor_wraps[d + i])
168
151
      node->outputs[i] = tensor_wraps[d + i]->tensors[tensor_wraps[d + i]->index];
169
277
}
170
171
static void _ccv_nnc_graph_exec_unwrap_while_expr(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
172
161
{
173
161
  assert(node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE);
174
161
  if (!node->p_while.tensor_wraps_ref)
175
155
    return;
176
6
  int i;
177
6
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->p_while.tensor_wraps_ref - 1);
178
6
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
179
18
  for (i = 0; i < tensor_wrap_array->size; 
i++12
)
180
12
    if (tensor_wraps[i])
181
6
    {
182
6
      assert(tensor_wraps[i]->index > 0);
183
6
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
184
6
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
185
      // Only now set the mv->it, because now this node is about to get executed.
186
6
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
187
6
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
188
6
    }
189
18
  
for (i = 0; 6
i < node->p_while.input_size;
i++12
)
190
12
    if (tensor_wraps[i])
191
6
      node->p_while.inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
192
6
}
193
194
static void _ccv_nnc_graph_exec_unwrap_phi(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_info_t* const node, const int ref)
195
41
{
196
41
  int i;
197
  // If the output tensor is a phi multi-view tensor, we update our selection to all the subscribers.
198
80
  for (i = 0; i < node->output_size; 
i++39
)
199
39
    if (CCV_IS_TENSOR_MULTIVIEW(node->outputs[i]) &&
200
39
      
((ccv_nnc_tensor_multiview_t*)node->outputs[i])->anchor == 29
CCV_NNC_MULTIVIEW_PHI29
)
201
29
    {
202
29
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)node->outputs[i];
203
29
      mv->it = CCV_NNC_MULTIVIEW_DATA(mv)[ref >= 0];
204
29
      ccv_nnc_tensor_multiview_synchronize(mv);
205
29
    }
206
41
}
207
208
static void _ccv_nnc_graph_exec_begin_synchronize_multiviews(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
209
291k
{
210
291k
  if (!node->tensor_wraps_ref)
211
290k
    return;
212
277
  int i;
213
277
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
214
277
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
215
1.04k
  for (i = 0; i < tensor_wrap_array->size; 
i++767
)
216
767
    if (tensor_wraps[i] && 
tensor_wraps[i]->update_required492
)
217
492
    {
218
492
      assert(tensor_wraps[i]->index > 0);
219
492
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
220
      // Now update the final pointer.
221
492
      ccv_nnc_tensor_multiview_synchronize(mv);
222
492
      tensor_wraps[i]->update_required = 0; // Reset, no need to update.
223
492
    }
224
277
}
225
226
void ccv_nnc_print_tensor_shape(const ccv_nnc_tensor_t* const tensor)
227
0
{
228
0
  int i;
229
0
  PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]);
230
0
  for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
231
0
    PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]);
232
0
  PRINT(CCV_CLI_INFO, "]");
233
0
}
234
235
void ccv_nnc_print_tensor_info(const ccv_nnc_tensor_t* const tensor)
236
0
{
237
0
  int i;
238
0
  PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]);
239
0
  for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++)
240
0
    PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]);
241
0
  PRINT(CCV_CLI_INFO, "]");
242
0
  if (!CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE) || tensor->info.dim[0] <= 0)
243
0
    return;
244
0
  const int nd = ccv_nnc_tensor_nd(tensor->info.dim);
245
0
  const int len = ccv_min(tensor->info.dim[nd - 1], 3);
246
0
  if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_GPU_MEMORY)
247
0
  {
248
0
#ifdef HAVE_CUDA
249
0
    switch (tensor->info.datatype)
250
0
    {
251
0
      case CCV_16F: {
252
0
        uint16_t data[len];
253
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t));
254
0
        float fp32[len];
255
0
        ccv_half_precision_to_float(data, fp32, len);
256
0
        for (i = 0; i < len; i++)
257
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
258
0
        break;
259
0
      }
260
0
      case CCV_16BF: {
261
0
        uint16_t data[len];
262
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t));
263
0
        float fp32[len];
264
0
        ccv_bfloat_to_float(data, fp32, len);
265
0
        for (i = 0; i < len; i++)
266
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
267
0
        break;
268
0
      }
269
0
      case CCV_32F: {
270
0
        float data[len];
271
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->info.type, len * sizeof(float));
272
0
        for (i = 0; i < len; i++)
273
0
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
274
0
        break;
275
0
      }
276
0
      case CCV_64F: {
277
0
        double data[len];
278
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->info.type, len * sizeof(double));
279
0
        for (i = 0; i < len; i++)
280
0
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
281
0
        break;
282
0
      }
283
0
      case CCV_32S: {
284
0
        int data[len];
285
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->info.type, len * sizeof(int));
286
0
        for (i = 0; i < len; i++)
287
0
          PRINT(CCV_CLI_VERBOSE, " %d", data[i]);
288
0
        break;
289
0
      }
290
0
      case CCV_64S: {
291
0
        int64_t data[len];
292
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->info.type, len * sizeof(int64_t));
293
0
        for (i = 0; i < len; i++)
294
0
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]);
295
0
        break;
296
0
      }
297
0
      case CCV_8U: {
298
0
        uint8_t data[len];
299
0
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->info.type, len * sizeof(uint8_t));
300
0
        for (i = 0; i < len; i++)
301
0
          PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]);
302
0
        break;
303
0
      }
304
0
    }
305
0
    if (ccv_nnc_tensor_count(tensor->info) > 3)
306
0
      PRINT(CCV_CLI_VERBOSE, " ..");
307
#elif defined(HAVE_MPS)
308
    switch (tensor->info.datatype)
309
    {
310
      case CCV_16F: {
311
        uint16_t data[len];
312
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->dataof, tensor->info.type, len * sizeof(uint16_t));
313
        float fp32[len];
314
        ccv_half_precision_to_float(data, fp32, len);
315
        for (i = 0; i < len; i++)
316
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
317
        break;
318
      }
319
      case CCV_16BF: {
320
        uint16_t data[len];
321
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->dataof, tensor->info.type, len * sizeof(uint16_t));
322
        float fp32[len];
323
        ccv_bfloat_to_float(data, fp32, len);
324
        for (i = 0; i < len; i++)
325
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
326
        break;
327
      }
328
      case CCV_32F: {
329
        float data[len];
330
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->dataof, tensor->info.type, len * sizeof(float));
331
        for (i = 0; i < len; i++)
332
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
333
        break;
334
      }
335
      case CCV_64F: {
336
        double data[len];
337
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->dataof, tensor->info.type, len * sizeof(double));
338
        for (i = 0; i < len; i++)
339
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
340
        break;
341
      }
342
      case CCV_32S: {
343
        int data[len];
344
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->dataof, tensor->info.type, len * sizeof(int));
345
        for (i = 0; i < len; i++)
346
          PRINT(CCV_CLI_VERBOSE, " %d", data[i]);
347
        break;
348
      }
349
      case CCV_64S: {
350
        int64_t data[len];
351
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->dataof, tensor->info.type, len * sizeof(int64_t));
352
        for (i = 0; i < len; i++)
353
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]);
354
        break;
355
      }
356
      case CCV_8U: {
357
        uint8_t data[len];
358
        mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->dataof, tensor->info.type, len * sizeof(uint8_t));
359
        for (i = 0; i < len; i++)
360
          PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]);
361
        break;
362
      }
363
    }
364
    if (ccv_nnc_tensor_count(tensor->info) > 3)
365
      PRINT(CCV_CLI_VERBOSE, " ..");
366
#endif
367
0
  } else if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_CPU_MEMORY) {
368
0
    switch (tensor->info.datatype)
369
0
    {
370
0
      case CCV_16F: {
371
0
        float fp32[len];
372
0
        ccv_half_precision_to_float((uint16_t*)tensor->data.f16, fp32, len);
373
0
        for (i = 0; i < len; i++)
374
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
375
0
        break;
376
0
      }
377
0
      case CCV_16BF: {
378
0
        float fp32[len];
379
0
        ccv_bfloat_to_float((uint16_t*)tensor->data.f16, fp32, len);
380
0
        for (i = 0; i < len; i++)
381
0
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
382
0
        break;
383
0
      }
384
0
      case CCV_32F:
385
0
        for (i = 0; i < len; i++)
386
0
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f32[i]);
387
0
        break;
388
0
      case CCV_64F:
389
0
        for (i = 0; i < len; i++)
390
0
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f64[i]);
391
0
        break;
392
0
      case CCV_32S:
393
0
        for (i = 0; i < len; i++)
394
0
          PRINT(CCV_CLI_VERBOSE, " %d", tensor->data.i32[i]);
395
0
        break;
396
0
      case CCV_64S:
397
0
        for (i = 0; i < len; i++)
398
0
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)tensor->data.i64[i]);
399
0
        break;
400
0
      case CCV_8U:
401
0
        for (i = 0; i < len; i++)
402
0
          PRINT(CCV_CLI_VERBOSE, " %d", (int)tensor->data.u8[i]);
403
0
        break;
404
0
    }
405
0
    if (ccv_nnc_tensor_count(tensor->info) > 3)
406
0
      PRINT(CCV_CLI_VERBOSE, " ..");
407
0
  }
408
0
}
409
410
static co_decl(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags));
411
412
6
static 
co_decl_task2
(_ccv_nnc_graph_exec_cases_of_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const ccv_nnc_graph_exec_schedule_t* const schd, ccv_nnc_tensor_t* const* const inputs, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, int flags), private(
413
6
  int ref;
414
6
  ccv_nnc_graph_t* sub_graph;
415
6
)) {
416
  // Wait until this stream context is done.
417
6
  
co_stream_await2
(CO_P(stream_context));
418
2
  if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
419
2
  {
420
2
    CO_V(ref) = CO_P(exec)->case_of.offset + CO_P(exec)->case_of.expr(CO_P(inputs), CO_P(exec)->input_size, CO_P(exec)->case_of.data);
421
2
    if (CO_P(tensor_tape))
422
0
      ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){
423
0
        .d = CO_P(exec_idx),
424
0
        .graph = CO_P(graph),
425
0
      }, CO_V(ref));
426
2
  } else {
427
0
    assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
428
0
    assert(CO_P(tensor_tape));
429
0
    CO_V(ref) = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){
430
0
        .d = CO_P(exec_idx),
431
0
        .graph = CO_P(graph),
432
0
      });
433
0
  }
434
2
  if (CO_V(ref) >= 0)
435
2
  {
436
2
    assert(CO_V(ref) < CO_P(exec)->graph_ref_size);
437
2
    CO_V(sub_graph) = *(ccv_nnc_graph_t**)ccv_array_get(CO_P(graph)->sub_graphs, CCV_NNC_GRAPH_REF(CO_P(exec))[CO_V(ref)] - 1);
438
2
    assert(CO_P(schd)->stream_size == 1);
439
2
    assert(CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]] == CO_V(sub_graph)->streams[0]);
440
2
    co_apply(_ccv_nnc_graph_topsorted_run_coro, (CO_V(sub_graph), CO_P(exec_idx), CO_V(sub_graph)->default_schedule, CO_P(exec), CO_P(tensor_tape), CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]], CO_P(flags)));
441
2
  }
442
2
  _ccv_nnc_graph_exec_unwrap_phi(CO_P(graph), CO_P(exec), CO_V(ref));
443
2
} co_end()
444
445
typedef struct {
446
  ccv_nnc_graph_t* graph;
447
  const ccv_nnc_graph_exec_schedule_t* node;
448
  ccv_nnc_stream_context_t* stream;
449
} ccv_nnc_graph_neighbor_context_discovery_t;
450
451
static ccv_nnc_stream_context_t* _ccv_nnc_graph_neighbor_context_discovery(const int device_id, void* const context)
452
13.9k
{
453
13.9k
  const ccv_nnc_graph_neighbor_context_discovery_t* const discovery = (ccv_nnc_graph_neighbor_context_discovery_t*)context;
454
13.9k
  if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(discovery->stream)) == device_id)
455
3.65k
    return discovery->stream;
456
10.3k
  ccv_nnc_graph_t* const graph = discovery->graph;
457
10.3k
  const ccv_nnc_graph_exec_schedule_t* const node = discovery->node;
458
10.3k
  int i;
459
  // First try to find in other streams of the same node.
460
30.9k
  for (i = 0; i < node->stream_size; 
i++20.6k
)
461
30.9k
  {
462
30.9k
    ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*node)[i]];
463
30.9k
    if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream)) == device_id)
464
10.3k
      return stream;
465
30.9k
  }
466
  // If cannot find, try to find in all the wait streams.
467
7
  
for (i = 0; 4
i < node->wait_size;
i++3
)
468
7
  {
469
7
    ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_signal_get_emitter(graph->signals[node->waits[i]]);
470
7
    if (stream_context && CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream_context)) == device_id)
471
4
      return stream_context;
472
7
  }
473
0
  return 0;
474
4
}
475
476
static co_routine_t* _ccv_nnc_graph_exec_run_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const ccv_nnc_graph_exec_schedule_t* const schd, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags)
477
229k
{
478
229k
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
479
229k
  ccv_nnc_tensor_t** inputs = node->inputs;
480
229k
  ccv_nnc_tensor_t** outputs = inputs ? 
inputs + node->input_size201k
:
028.1k
;
481
229k
  if (tensor_tape)
482
0
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
483
  /* Broadcast the updates to all subscribed references for input / output, even though at th
484
   * time output is not written yet, propagate pointer change is still valid. */
485
229k
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
486
229k
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || 
node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD229k
)
487
4
  {
488
4
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
489
2
    {
490
2
      ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]];
491
2
      return co_new(_ccv_nnc_graph_exec_cases_of_coro, (graph, idx, node, schd, inputs, tensor_tape, node_stream, flags));
492
2
    } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
493
2
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
494
2
      assert(graph->streams[SCHEDULE_STREAMS(*schd)[0]] == sub_graph->streams[0]);
495
2
      return co_new(_ccv_nnc_graph_topsorted_run_coro, (sub_graph, idx, sub_graph->default_schedule, node, tensor_tape, graph->streams[SCHEDULE_STREAMS(*schd)[0]], flags));
496
2
    }
497
229k
  } else {
498
229k
    PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d] (%d)\n", 
ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size, 0
SCHEDULE_STREAMS0
(*schd)[0]);
499
229k
    int i, j;
500
229k
    int flag = 0;
501
470k
    for (i = 0; i < schd->stream_size; 
i++240k
)
502
240k
    {
503
240k
      ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]];
504
355k
      for (j = 0; j < schd->wait_size; 
j++115k
)
505
115k
      {
506
115k
        ccv_nnc_stream_context_wait_signal(stream, graph->signals[schd->waits[j]]);
507
115k
        if (!flag)
508
43.1k
        {
509
43.1k
          PRINT(CCV_CLI_INFO, "Wait: (%d, %d)", 
SCHEDULE_STREAMS0
(*schd)[i], schd->waits[j]);
510
43.1k
          flag = 1;
511
43.1k
        } else
512
71.9k
          PRINT(CCV_CLI_INFO, ", (%d, %d)", 
SCHEDULE_STREAMS0
(*schd)[i], schd->waits[j]);
513
115k
      }
514
240k
    }
515
229k
    if (flag)
516
43.1k
      PRINT(CCV_CLI_INFO, "\n");
517
880k
    for (i = 0; i < node->input_size; 
i++650k
)
518
650k
    {
519
650k
      PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", 
i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? 0
CCV_TENSOR_GET_DEVICE_ID0
(inputs[i]->info.type) : -1));
520
650k
      if (inputs[i] && 
CCV_CLI_OUTPUT_LEVEL_IS498k
(CCV_CLI_INFO))
521
0
        ccv_nnc_print_tensor_info(inputs[i]);
522
650k
      PRINT(CCV_CLI_INFO, "\n");
523
650k
    }
524
586k
    for (i = 0; i < node->output_size; 
i++356k
)
525
356k
    {
526
356k
      PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", 
i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? 0
CCV_TENSOR_GET_DEVICE_ID0
(outputs[i]->info.type) : -1));
527
356k
      if (outputs[i] && 
CCV_CLI_OUTPUT_LEVEL_IS336k
(CCV_CLI_INFO))
528
0
        ccv_nnc_print_tensor_shape(outputs[i]);
529
356k
      PRINT(CCV_CLI_INFO, "\n");
530
356k
    }
531
229k
    ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]];
532
229k
    ccv_nnc_graph_neighbor_context_discovery_t discovery_context = {
533
229k
      .graph = graph,
534
229k
      .node = schd,
535
229k
      .stream = node_stream
536
229k
    };
537
229k
    ccv_nnc_stream_context_set_neighbor_discovery(node_stream, _ccv_nnc_graph_neighbor_context_discovery, &discovery_context);
538
229k
    const int status = ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, node_stream);
539
229k
    if (status != 0)
540
0
      PRINT(CCV_CLI_INFO, "Invalid Status: %d\n", status);
541
229k
    if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE))
542
0
    {
543
0
      for (i = 0; i < node->output_size; i++)
544
0
      {
545
0
        PRINT(CCV_CLI_VERBOSE, "POST: |<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
546
0
        if (outputs[i])
547
0
          ccv_nnc_print_tensor_info(outputs[i]);
548
0
        PRINT(CCV_CLI_VERBOSE, "\n");
549
0
      }
550
0
    }
551
229k
    flag = 0;
552
470k
    for (i = 0; i < schd->stream_size; 
i++240k
)
553
240k
      if (SCHEDULE_SIGNALS(*schd)[i] >= 0)
554
57.9k
      {
555
57.9k
        ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]];
556
57.9k
        ccv_nnc_stream_context_emit_signal(stream, graph->signals[SCHEDULE_SIGNALS(*schd)[i]]);
557
57.9k
        if (!flag)
558
57.9k
        {
559
57.9k
          PRINT(CCV_CLI_INFO, "Emit: (%d, %d)", 
SCHEDULE_STREAMS0
(*schd)[i],
SCHEDULE_SIGNALS0
(*schd)[i]);
560
57.9k
          flag = 1;
561
57.9k
        } else
562
9
          PRINT(CCV_CLI_INFO, ", (%d, %d)", 
SCHEDULE_STREAMS0
(*schd)[i],
SCHEDULE_SIGNALS0
(*schd)[i]);
563
57.9k
      }
564
229k
    if (flag)
565
57.9k
      PRINT(CCV_CLI_INFO, "\n");
566
229k
  }
567
229k
  return 0;
568
229k
}
569
570
static void _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_graph_exec_info_t* const node, co_routine_t* const task)
571
6
{
572
6
  int i, j;
573
6
  if (node->outgoings)
574
8
    
for (i = 0; 4
i < node->outgoings->rnum;
i++4
)
575
4
    {
576
4
      const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
577
4
      const ccv_nnc_graph_exec_schedule_t* const outgoing_schd = schd_info + outgoing_idx;
578
      // An outgoing stream can be blocked by multiple other tasks from other streams. But it is OK,
579
      // because on next round of execution, that one will be marked as blocked again.
580
8
      for (j = 0; j < outgoing_schd->stream_size; 
j++4
)
581
4
        graph->block_stream_tasks[SCHEDULE_STREAMS(*outgoing_schd)[j]] = task;
582
4
    }
583
6
}
584
585
6
static 
co_decl_task2
(_ccv_nnc_graph_wait_any_sub_tasks, (ccv_nnc_graph_t* const graph, co_routine_t* const* const sub_tasks, const int sub_task_size, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const pending_nodes, const int pending_node_size), private(
586
6
)) {
587
6
  assert(CO_P(sub_task_size) > 0);
588
2
  co_await_any(CO_P(sub_tasks), CO_P(sub_task_size));
589
  // This is not good, these local variables need to be in the private section.
590
  // I got away with it because there is no yield or resume or apply or any after await above.
591
2
  int i, j, k;
592
4
  for (i = 0; i < CO_P(sub_task_size); 
i++2
)
593
2
    if (co_is_done(CO_P(sub_tasks)[i]))
594
2
    {
595
6
      for (j = 0; j < CO_P(pending_node_size); 
j++4
)
596
4
      {
597
4
        const ccv_nnc_graph_exec_schedule_t* const node = CO_P(schd_info) + CO_P(pending_nodes)[j];
598
8
        for (k = 0; k < node->stream_size; 
k++4
)
599
4
          if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] == CO_P(sub_tasks)[i])
600
2
            CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] = 0;
601
4
      }
602
2
      co_free(CO_P(sub_tasks)[i]);
603
2
    }
604
2
} co_end()
605
606
53.2k
static 
co_decl_task26.6k
(_ccv_nnc_graph_exec_run_loop, (ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const psort, const int start_index, const int exec_info_size, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags), private(
607
53.2k
  int i, p, q;
608
53.2k
  int sub_task_size;
609
53.2k
  co_routine_t** sub_tasks;
610
53.2k
  int* pending_nodes[2];
611
53.2k
  int pending_node_size[2];
612
53.2k
  int idx;
613
53.2k
  ccv_nnc_graph_exec_info_t* node;
614
53.2k
  const ccv_nnc_graph_exec_schedule_t* schd;
615
53.2k
  co_routine_t* task;
616
53.2k
)) {
617
53.2k
  
CO_V26.6k
(sub_task_size) = 0;
618
53.2k
  
CO_V26.6k
(sub_tasks) = (co_routine_t**)ccv_nnc_graph_buffer(
CO_P26.6k
(graph), sizeof(co_routine_t*) * (
CO_P26.6k
(graph)->sub_graphs26.6k
?
CO_P3
(graph)->sub_graphs->rnum3
:
026.6k
) + sizeof(int) *
CO_P26.6k
(exec_info_size) * 2);
619
53.2k
  
CO_V26.6k
(pending_nodes)[0] = (int*)(
CO_V26.6k
(sub_tasks) + (
CO_P26.6k
(graph)->sub_graphs26.6k
?
CO_P3
(graph)->sub_graphs->rnum3
:
026.6k
));
620
53.2k
  
CO_V26.6k
(pending_nodes)[1] =
CO_V26.6k
(pending_nodes)[0] +
CO_P26.6k
(exec_info_size);
621
53.2k
  
CO_V26.6k
(pending_node_size)[0] = 0;
622
53.2k
  
CO_V26.6k
(pending_node_size)[1] = 0;
623
256k
  for (
CO_V26.6k
(i) =
CO_P26.6k
(start_index); CO_V(i) < CO_P(exec_info_size);
CO_V229k
(i)++229k
)
624
229k
  {
625
229k
    if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
626
0
      break;
627
229k
    CO_V(idx) = CO_P(psort) ? 
CO_P94.7k
(psort)[94.7k
CO_V94.7k
(i)] :
CO_V135k
(i);
628
229k
    CO_V(node) = CO_P(exec_info) + CO_V(idx);
629
229k
    CO_V(schd) = CO_P(schd_info) + CO_V(idx);
630
    // If stream is blocked by but not blocked by current executing task.
631
229k
    int blocked = 0, j;
632
470k
    for (j = 0; j < CO_V(schd)->stream_size; 
j++240k
)
633
240k
      if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]])
634
4
      {
635
4
        CO_V(pending_nodes)[0][CO_V(pending_node_size)[0]++] = CO_V(idx);
636
4
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]);
637
4
        blocked = 1;
638
4
      }
639
229k
    if (blocked)
640
4
      continue;
641
229k
    CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags));
642
229k
    if (CO_V(task))
643
4
    {
644
4
      co_resume(CO_V(task));
645
4
      if (!co_is_done(CO_V(task)))
646
2
      {
647
2
        CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task);
648
2
        int j;
649
4
        for (j = 0; j < CO_V(schd)->stream_size; 
j++2
)
650
2
          CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task);
651
2
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task));
652
2
      } else
653
2
        co_free(CO_V(task));
654
4
    }
655
229k
  }
656
26.6k
  if (CO_V(sub_task_size))
657
26.6k
    co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[0], CO_V(pending_node_size)[0]));
658
26.6k
  if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
659
26.6k
    co_return();
660
26.6k
  CO_V(p) = 0;
661
26.6k
  CO_V(q) = 1;
662
26.6k
  while (CO_V(pending_node_size)[CO_V(p)] > 0)
663
2
  {
664
2
    CO_V(pending_node_size)[CO_V(q)] = 0;
665
2
    CO_V(sub_task_size) = 0;
666
6
    for (
CO_V2
(i) = 0; CO_V(i) < CO_V(pending_node_size)[CO_V(p)];
CO_V4
(i)++4
)
667
4
    {
668
4
      CO_V(idx) = CO_V(pending_nodes)[CO_V(p)][CO_V(i)];
669
4
      CO_V(node) = CO_P(exec_info) + CO_V(idx);
670
4
      CO_V(schd) = CO_P(schd_info) + CO_V(idx);
671
4
      int blocked = 0, j;
672
8
      for (j = 0; j < CO_V(schd)->stream_size; 
j++4
)
673
4
        if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]])
674
0
        {
675
0
          _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]);
676
0
          CO_V(pending_nodes)[CO_V(q)][CO_V(pending_node_size)[CO_V(q)]++] = CO_V(idx);
677
0
          blocked = 1;
678
0
        }
679
4
      if (blocked)
680
0
        continue;
681
4
      CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags));
682
4
      if (CO_V(task))
683
0
      {
684
0
        co_resume(CO_V(task));
685
0
        if (!co_is_done(CO_V(task)))
686
0
        {
687
0
          CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task);
688
0
          for (j = 0; j < CO_V(schd)->stream_size; j++)
689
0
            CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task);
690
0
          _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task));
691
0
        } else
692
0
          co_free(CO_V(task));
693
0
      }
694
4
    }
695
2
    int t;
696
2
    CCV_SWAP(CO_V(p), CO_V(q), t);
697
2
    if (CO_V(sub_task_size))
698
2
      co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[CO_V(p)], CO_V(pending_node_size)[CO_V(p)]));
699
2
  }
700
26.6k
} co_end()
701
702
79.8k
co_task26.6k
(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags), private(
703
79.8k
  ccv_nnc_graph_exec_info_t* exec_info;
704
79.8k
  const ccv_nnc_graph_exec_schedule_t* schd_info;
705
79.8k
  co_routine_t* previous_main;
706
79.8k
  int stream_0;
707
  // while loop
708
79.8k
  int64_t count, reverse_count;
709
79.8k
  int graph_breakpoint_size;
710
79.8k
  int i, j;
711
79.8k
)) {
712
79.8k
  assert(CO_P(graph)->stream_size > 0);
713
26.6k
  int i;
714
  // Assign the resource container pointer.
715
104k
  for (i = 0; i < CO_P(graph)->stream_size; 
i++78.3k
)
716
78.3k
    CO_P(graph)->streams[i]->resource_container = CO_P(stream_context)->_inline_container;
717
26.6k
  CO_V(exec_info) = (ccv_nnc_graph_exec_info_t*)ccv_array_get(CO_P(graph)->exec_info, 0);
718
26.6k
  CO_V(schd_info) = CO_P(schedule)->exec_info;
719
26.6k
  CO_V(stream_0) = CO_P(schedule)->stream_0;
720
26.6k
  if (CO_P(exec_idx) == -1)
721
26.5k
  {
722
26.5k
    if (CO_P(stream_context)->main)
723
0
    {
724
0
      CO_V(previous_main) = CO_P(stream_context)->main;
725
0
      CO_P(stream_context)->main = co_self();
726
      // Wait the previous task to be done. This makes sure that our graph run is serial on the same stream.
727
0
      assert(!co_is_done(CO_V(previous_main)));
728
0
      co_await(CO_V(previous_main));
729
0
    } else
730
26.5k
      CO_P(stream_context)->main = co_self();
731
26.5k
    PRINT(CCV_CLI_INFO, "Graph Stream %d Begin", 
CO_V0
(stream_0));
732
26.5k
    ccv_nnc_stream_signal_t* stream_0_signal;
733
26.5k
    if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)])
734
907
    {
735
      // Make sure when we start work on streams[0], the current stream context is done.
736
907
      stream_0_signal = ccv_nnc_stream_context_emit_signal_new(CO_P(stream_context));
737
907
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], stream_0_signal);
738
25.6k
    } else if (CO_P(schedule)->stream_1_size) {
739
81
      ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->begin);
740
81
      stream_0_signal = CO_P(schedule)->begin;
741
81
    }
742
26.5k
    int i, flag = 0;
743
26.8k
    for (i = 0; i < CO_P(schedule)->stream_1_size; 
i++250
)
744
250
    {
745
250
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_P(schedule)->stream_1s[i]], stream_0_signal);
746
250
      if (!flag)
747
86
      {
748
86
        PRINT(CCV_CLI_INFO, ", Wait: %d", 
CO_P0
(schedule)->stream_1s[i]);
749
86
        flag = 1;
750
86
      } else
751
164
        PRINT(CCV_CLI_INFO, ", %d", 
CO_P0
(schedule)->stream_1s[i]);
752
250
    }
753
26.5k
    PRINT(CCV_CLI_INFO, "\n");
754
26.5k
  } else {
755
4
    assert(CO_P(stream_context) == CO_P(graph)->streams[0]);
756
4
  }
757
26.6k
  if (CO_P(exec) && 
(4
CO_P4
(exec)->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
758
2
  {
759
2
    assert(CO_P(schedule) == CO_P(graph)->default_schedule);
760
2
    assert(CO_P(exec)->p_while.expr);
761
2
    CO_V(count) = 0;
762
    // This is a forward while loop. Backward while loop will just consult its pairing part.
763
2
    if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
764
2
    {
765
2
      CO_V(graph_breakpoint_size) = CO_P(graph)->breakpoint_offset + CO_P(graph)->breakpoint_size;
766
10
      for (;; ++CO_V(count))
767
12
      {
768
12
        CO_P(graph)->while_count = CO_V(count);
769
12
        if (CO_P(tensor_tape))
770
0
          ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){
771
0
            .d = CO_P(exec_idx),
772
0
            .graph = CO_P(graph)->p,
773
0
          }, CO_V(count));
774
12
        _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), 0);
775
12
        if (CO_V(count) > 0)
776
10
          _ccv_nnc_graph_transit_move_to(CO_P(graph));
777
12
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_V(graph_breakpoint_size), CO_P(tensor_tape), CO_P(flags)));
778
12
        if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
779
0
          break;
780
        // Reached breakpoints, now check the breakpoint, if not met, break out.
781
        // Wait until everything on the stream is executed.
782
24
        
for (12
CO_V12
(i) =
CO_P12
(graph)->breakpoint_offset; CO_V(i) < CO_V(graph_breakpoint_size);
CO_V12
(i)++12
)
783
24
          
for (12
CO_V12
(j) = 0; CO_V(j) < CO_V(schd_info)[CO_V(i)].stream_size;
CO_V12
(j)++12
)
784
12
            co_stream_await(CO_P(graph)->streams[SCHEDULE_STREAMS(CO_V(schd_info)[CO_V(i)])[CO_V(j)]]);
785
12
        _ccv_nnc_graph_exec_unwrap_while_expr(CO_P(graph), CO_P(exec));
786
12
        if (!CO_P(exec)->p_while.expr(CO_P(exec)->p_while.inputs, CO_P(exec)->p_while.input_size, CO_P(exec)->p_while.data))
787
2
        {
788
2
          _ccv_nnc_graph_rewrap(CO_P(graph));
789
          // If we break from here, it is ok because all the streams are waited.
790
2
          break;
791
2
        }
792
10
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_V(graph_breakpoint_size), CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
793
        // If it is cancelled here, we don't need to breakout yet, we can breakout on earlier place. The most important thing is to avoid stream wait if there is a cancel.
794
10
        _ccv_nnc_graph_from_move_transit(CO_P(graph));
795
10
        _ccv_nnc_graph_rewrap(CO_P(graph));
796
10
      }
797
2
    } else {
798
      // For backward graph, no need to evaluate the while expr.
799
0
      assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
800
0
      assert(CO_P(graph)->pair);
801
0
      assert(CO_P(tensor_tape));
802
0
      CO_V(count) = 0;
803
0
      CO_V(reverse_count) = CO_P(graph)->while_count = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){
804
0
          .d = CO_P(exec_idx),
805
0
          .graph = CO_P(graph)->p,
806
0
        });
807
0
      _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count));
808
0
      co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_P(graph)->breakpoint_offset, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
809
      // If it is cancelled here, we don't need to breakout yet, we can breakout later.
810
0
      _ccv_nnc_graph_from_move_transit(CO_P(graph));
811
0
      _ccv_nnc_graph_rewrap(CO_P(graph));
812
0
      for (CO_V(count) = 1; CO_V(reverse_count) > 0; ++CO_V(count))
813
0
      {
814
0
        CO_P(graph)->while_count = --CO_V(reverse_count);
815
0
        _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count));
816
0
        _ccv_nnc_graph_transit_move_to(CO_P(graph));
817
0
        co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags)));
818
0
        if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
819
0
          break;
820
0
        _ccv_nnc_graph_from_move_transit(CO_P(graph));
821
0
        _ccv_nnc_graph_rewrap(CO_P(graph));
822
0
      }
823
0
    }
824
2
    if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
825
0
    {
826
      // The most important thing is to reset main and then return, we don't need to wait for any streaming event.
827
0
      if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self())
828
0
        CO_P(stream_context)->main = 0;
829
0
      co_return();
830
0
    }
831
2
    assert(CO_V(stream_0) == 0);
832
2
    int i;
833
2
    for (i = 0; i < CO_P(schedule)->wait_size; 
i++0
)
834
0
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[0], CO_P(graph)->signals[CO_P(schedule)->waits[i]]);
835
26.5k
  } else {
836
26.5k
    CO_P(graph)->while_count = 0;
837
26.5k
    co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), CO_P(schedule)->psort, 0, CO_P(schedule)->psort ? CO_P(schedule)->psort_size : CO_P(schedule)->exec_info_size, CO_P(tensor_tape), CO_P(flags)));
838
26.5k
    if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL)
839
0
    {
840
      // The most important thing is to reset main and then return, we don't need to wait for any streaming event.
841
0
      if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self())
842
0
        CO_P(stream_context)->main = 0;
843
0
      co_return();
844
0
    }
845
26.5k
    PRINT(CCV_CLI_INFO, "Graph Stream %d End", 
CO_V0
(stream_0));
846
26.5k
    int i, flag = 0;
847
26.7k
    for (i = 0; i < CO_P(schedule)->wait_size; 
i++194
)
848
194
    {
849
194
      ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(graph)->signals[CO_P(schedule)->waits[i]]);
850
194
      if (!flag)
851
66
      {
852
66
        PRINT(CCV_CLI_INFO, ", Wait: %d", 
CO_P0
(schedule)->waits[i]);
853
66
        flag = 1;
854
66
      } else
855
128
        PRINT(CCV_CLI_INFO, ", %d", 
CO_P0
(schedule)->waits[i]);
856
194
    }
857
26.5k
    PRINT(CCV_CLI_INFO, "\n");
858
26.5k
  }
859
26.6k
  if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)])
860
907
  {
861
907
    assert(CO_P(exec_idx) == -1);
862
907
    ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->end);
863
907
    ccv_nnc_stream_context_wait_signal(CO_P(stream_context), CO_P(schedule)->end);
864
907
  }
865
  // Reset main to 0 if it is current me.
866
26.6k
  if (CO_P(exec_idx) == -1 && 
CO_P26.5k
(stream_context)->main == 26.5k
co_self26.5k
())
867
26.5k
    CO_P(stream_context)->main = 0;
868
26.6k
} co_end()
869
870
static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context);
871
872
static inline void _ccv_nnc_graph_exec_run(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)
873
61.2k
{
874
61.2k
  int i;
875
61.2k
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
876
61.2k
  ccv_nnc_tensor_t** inputs = node->inputs;
877
61.2k
  ccv_nnc_tensor_t** outputs = inputs ? 
inputs + node->input_size61.0k
:
0223
;
878
61.2k
  if (tensor_tape)
879
78
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
880
  /* Broadcast the updates to all subscribed references for input / output, even though at th
881
   * time output is not written yet, propagate pointer change is still valid. */
882
61.2k
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
883
61.2k
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || 
node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD61.1k
)
884
67
  {
885
67
    assert(!stream_context); // This doesn't work properly with stream context.
886
67
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
887
39
    {
888
39
      int ref;
889
39
      if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
890
35
      {
891
35
        ref = node->case_of.offset + node->case_of.expr(inputs, node->input_size, node->case_of.data);
892
35
        if (tensor_tape)
893
4
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
894
4
            .d = idx,
895
4
            .graph = graph,
896
4
          }, ref);
897
35
      } else {
898
4
        assert(node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
899
4
        assert(tensor_tape);
900
4
        ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
901
4
            .d = idx,
902
4
            .graph = graph,
903
4
          });
904
4
      }
905
39
      if (ref >= 0)
906
31
      {
907
31
        assert(ref < node->graph_ref_size);
908
31
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[ref] - 1);
909
31
        _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context);
910
31
      }
911
39
      _ccv_nnc_graph_exec_unwrap_phi(graph, node, ref);
912
39
    } else 
if (28
node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE28
) {
913
28
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
914
28
      _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context);
915
28
    }
916
61.1k
  } else {
917
61.1k
    PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size);
918
235k
    for (i = 0; i < node->input_size; 
i++174k
)
919
174k
    {
920
174k
      PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", 
i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? 0
CCV_TENSOR_GET_DEVICE_ID0
(inputs[i]->info.type) : -1));
921
174k
      if (inputs[i] && 
CCV_CLI_OUTPUT_LEVEL_IS131k
(CCV_CLI_INFO))
922
0
        ccv_nnc_print_tensor_info(inputs[i]);
923
174k
      PRINT(CCV_CLI_INFO, "\n");
924
174k
    }
925
61.1k
    ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, stream_context);
926
156k
    for (i = 0; i < node->output_size; 
i++95.2k
)
927
95.2k
    {
928
95.2k
      PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", 
i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? 0
CCV_TENSOR_GET_DEVICE_ID0
(outputs[i]->info.type) : -1));
929
95.2k
      if (outputs[i] && 
CCV_CLI_OUTPUT_LEVEL_IS78.9k
(CCV_CLI_INFO))
930
0
        ccv_nnc_print_tensor_info(outputs[i]);
931
95.2k
      PRINT(CCV_CLI_INFO, "\n");
932
95.2k
    }
933
61.1k
  }
934
61.2k
}
935
936
static inline void _ccv_nnc_graph_topsorted_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const int flags, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
937
9.93k
{
938
9.93k
  int i;
939
9.93k
  if (exec && 
(exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)51
)
940
23
  {
941
23
    assert(!stream_context); // This doesn't work properly with stream context.
942
23
    assert(exec->p_while.expr);
943
23
    int64_t count = 0;
944
    // This is a forward while loop. Backward while loop will just consult its pairing part.
945
23
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
946
22
    {
947
22
      const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size;
948
104
      for (;; ++count)
949
126
      {
950
126
        graph->while_count = count;
951
126
        if (tensor_tape)
952
5
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
953
5
            .d = exec_idx,
954
5
            .graph = graph->p,
955
5
          }, count);
956
126
        _ccv_nnc_graph_unwrap(graph, count, 0);
957
126
        if (count > 0)
958
104
          _ccv_nnc_graph_transit_move_to(graph);
959
312
        for (i = 0; i < graph_breakpoint_size; 
i++186
)
960
186
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
961
126
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
962
        // Reached breakpoints, now check the breakpoint, if not met, break out.
963
126
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
964
22
        {
965
22
          _ccv_nnc_graph_rewrap(graph);
966
22
          break;
967
22
        }
968
210
        
for (i = graph_breakpoint_size; 104
i < graph->exec_info->rnum;
i++106
)
969
106
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
970
104
        _ccv_nnc_graph_from_move_transit(graph);
971
104
        _ccv_nnc_graph_rewrap(graph);
972
104
      }
973
22
    } else {
974
      // For backward graph, no need to evaluate the while expr.
975
1
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
976
1
      assert(graph->pair);
977
1
      assert(tensor_tape);
978
1
      count = 0;
979
1
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
980
1
          .d = exec_idx,
981
1
          .graph = graph->p,
982
1
        });
983
1
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
984
5
      for (i = graph->breakpoint_offset; i < graph->exec_info->rnum; 
i++4
)
985
4
        _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
986
1
      _ccv_nnc_graph_from_move_transit(graph);
987
1
      _ccv_nnc_graph_rewrap(graph);
988
5
      for (count = 1; reverse_count > 0; 
++count4
)
989
4
      {
990
4
        graph->while_count = --reverse_count;
991
4
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
992
4
        _ccv_nnc_graph_transit_move_to(graph);
993
20
        for (i = 0; i < graph->exec_info->rnum; 
i++16
)
994
16
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
995
4
        _ccv_nnc_graph_from_move_transit(graph);
996
4
        _ccv_nnc_graph_rewrap(graph);
997
4
      }
998
1
    }
999
9.91k
  } else {
1000
9.91k
    graph->while_count = 0;
1001
70.6k
    for (i = 0; i < graph->exec_info->rnum; 
i++60.6k
)
1002
60.6k
      _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
1003
9.91k
  }
1004
9.93k
}
1005
1006
static inline void _ccv_nnc_graph_run_slow_path(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1007
35
{
1008
35
  int i, j;
1009
35
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? 
sources27
:
(ccv_nnc_graph_exec_t*)8
ccv_array_get8
(graph->sources, 0);
1010
35
  const int graph_source_size = source_size ? 
source_size27
:
graph->sources->rnum8
;
1011
35
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? 
destinations27
:
(ccv_nnc_graph_exec_t*)8
ccv_array_get8
(graph->destinations, 0);
1012
35
  const int graph_destination_size = destination_size ? 
destination_size27
:
graph->destinations->rnum8
;
1013
35
#define visitor(node, idx, ...) \
1014
235
  _ccv_nnc_graph_exec_run(graph, node, idx, tensor_tape, stream_context, flags)
1015
35
  if (exec && 
(exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)8
)
1016
5
  {
1017
5
    assert(!stream_context); // This doesn't work properly with stream context.
1018
5
    assert(exec->p_while.expr);
1019
5
    int64_t count = 0;
1020
    // This is a forward while loop. Backward while loop will just consult its pairing part.
1021
5
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
1022
4
    {
1023
4
      ccv_array_t* follows = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), graph->breakpoint_size, 0);
1024
8
      for (i = 0; i < graph->breakpoint_size; 
i++4
)
1025
4
      {
1026
4
        const ccv_nnc_graph_exec_info_t* const exec_info = (const ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, graph->breakpoints->d);
1027
4
        if (exec_info->outgoings)
1028
6
          
for (j = 0; 3
j < exec_info->outgoings->rnum;
j++3
)
1029
3
          {
1030
3
            const ccv_nnc_graph_exec_t exec = {
1031
3
              .d = *(int*)ccv_array_get(exec_info->outgoings, j),
1032
3
              .graph = graph,
1033
3
            };
1034
3
            ccv_array_push(follows, &exec);
1035
3
          }
1036
4
      }
1037
19
      for (;; ++count)
1038
23
      {
1039
23
        graph->while_count = count;
1040
23
        if (tensor_tape)
1041
5
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
1042
5
            .d = exec_idx,
1043
5
            .graph = graph->p,
1044
5
          }, count);
1045
23
        _ccv_nnc_graph_unwrap(graph, count, 0);
1046
23
        if (count > 0)
1047
19
          _ccv_nnc_graph_transit_move_to(graph);
1048
28
        
CCV_NNC_GRAPH_VISIT23
(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph->breakpoints, graph->breakpoint_size, 0, visitor);
1049
23
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
1050
        // Reached breakpoints, now check the breakpoint, if not met, break out.
1051
23
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
1052
4
        {
1053
4
          _ccv_nnc_graph_rewrap(graph);
1054
4
          break;
1055
4
        }
1056
19
        if (follows->rnum > 0)
1057
19
          
CCV_NNC_GRAPH_VISIT15
(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(follows, 0), follows->rnum, graph_destinations, graph_destination_size, 0,
visitor15
);
1058
19
        _ccv_nnc_graph_from_move_transit(graph);
1059
19
        _ccv_nnc_graph_rewrap(graph);
1060
19
      }
1061
4
      ccv_array_free(follows);
1062
4
    } else {
1063
      // For backward graph, no need to evaluate the while expr.
1064
1
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
1065
1
      assert(graph->pair);
1066
1
      assert(tensor_tape);
1067
1
      count = 0;
1068
1
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
1069
1
          .d = exec_idx,
1070
1
          .graph = graph->p,
1071
1
        });
1072
1
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
1073
2
      
CCV_NNC_GRAPH_VISIT1
(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, graph_destinations, graph_destination_size, 1, visitor);
1074
1
      _ccv_nnc_graph_from_move_transit(graph);
1075
1
      _ccv_nnc_graph_rewrap(graph);
1076
5
      for (count = 1; reverse_count > 0; 
++count4
)
1077
4
      {
1078
4
        graph->while_count = --reverse_count;
1079
4
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
1080
4
        _ccv_nnc_graph_transit_move_to(graph);
1081
8
        
CCV_NNC_GRAPH_VISIT4
(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1082
4
        _ccv_nnc_graph_from_move_transit(graph);
1083
4
        _ccv_nnc_graph_rewrap(graph);
1084
4
      }
1085
1
    }
1086
30
  } else {
1087
30
    graph->while_count = 0;
1088
182
    
CCV_NNC_GRAPH_VISIT30
(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1089
30
  }
1090
35
#undef visitor
1091
35
}
1092
1093
static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1094
9.96k
{
1095
9.96k
  assert((sources == 0 && source_size == 0) || (sources && source_size));
1096
9.96k
  assert((destinations == 0 && destination_size == 0) || (destinations && destination_size));
1097
9.96k
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? 
sources27
:
(ccv_nnc_graph_exec_t*)9.94k
ccv_array_get9.94k
(graph->sources, 0);
1098
9.96k
  const int graph_source_size = source_size ? 
source_size27
:
graph->sources->rnum9.94k
;
1099
9.96k
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? 
destinations27
:
(ccv_nnc_graph_exec_t*)9.94k
ccv_array_get9.94k
(graph->destinations, 0);
1100
9.96k
  const int graph_destination_size = destination_size ? 
destination_size27
:
graph->destinations->rnum9.94k
;
1101
9.96k
  int i;
1102
19.9k
  for (i = 0; i < graph_source_size; 
i++9.97k
)
1103
9.97k
    if (graph_sources[i].graph != graph)
1104
0
      return CCV_NNC_EXEC_INVALID;
1105
19.9k
  
for (i = 0; 9.96k
i < graph_destination_size;
i++9.97k
)
1106
9.97k
    if (graph_destinations[i].graph != graph)
1107
0
      return CCV_NNC_EXEC_INVALID;
1108
  // When topsorted is true, there is no memory allocation when run the graph.
1109
9.96k
  const int topsorted = (!sources && 
!destinations9.94k
&&
graph->topsorted9.94k
);
1110
9.96k
  if (topsorted)
1111
9.93k
    _ccv_nnc_graph_topsorted_run(graph, exec_idx, exec, flags, tensor_tape, stream_context);
1112
35
  else
1113
35
    _ccv_nnc_graph_run_slow_path(graph, exec_idx, exec, inputs, input_size, outputs, output_size, flags, sources, source_size, destinations, destination_size, tensor_tape, stream_context);
1114
9.96k
  return CCV_NNC_EXEC_SUCCESS;
1115
9.96k
}
1116
1117
int ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1118
10.1k
{
1119
10.1k
  __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE);
1120
10.1k
  if (stream_context && 
graph->topsorted215
&&
graph->stream_size > 0215
&&
graph->default_schedule215
&&
source_size == 0215
&&
destination_size == 0215
)
1121
215
  {
1122
215
    co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context);
1123
215
    co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, graph->default_schedule, 0, tensor_tape, stream_context, flags));
1124
215
    co_schedule(scheduler, task);
1125
    // I don't need to worry about freeing this task, it will free itself at the end.
1126
215
    return CCV_NNC_EXEC_SUCCESS;
1127
215
  } else
1128
9.90k
    return _ccv_nnc_graph_run(graph, -1, 0, 0, 0, 0, 0, flags, sources, source_size, destinations, destination_size, tensor_tape, 0 /* In this case, we don't support stream context yet. */);
1129
10.1k
}
1130
1131
int ccv_nnc_graph_run_with_schedule(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_static_schedule_t* const _schedule, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const _stream_context)
1132
26.3k
{
1133
26.3k
  assert(graph->topsorted);
1134
26.3k
  if (graph->exec_info->rnum == 0)
1135
0
    return CCV_NNC_EXEC_SUCCESS;
1136
26.3k
  __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE);
1137
26.3k
  assert(graph->stream_size > 0);
1138
26.3k
  const ccv_nnc_graph_static_schedule_t* const schedule = _schedule ? 
_schedule15.8k
:
graph->default_schedule10.5k
;
1139
26.3k
  assert(schedule);
1140
26.3k
  assert(schedule->stream_0 < graph->stream_size);
1141
26.3k
  ccv_nnc_stream_context_t* const stream_context = _stream_context ? 
_stream_context699
:
graph->streams[schedule->stream_0]25.6k
;
1142
26.3k
  co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context);
1143
26.3k
  co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, schedule, 0, tensor_tape, stream_context, flags));
1144
26.3k
  co_schedule(scheduler, task);
1145
  // I don't need to worry about freeing this task, it will free itself at the end.
1146
26.3k
  if (!_stream_context) // If no stream context provided, this is a sync operation.
1147
25.6k
    ccv_nnc_stream_context_wait(stream_context);
1148
26.3k
  return CCV_NNC_EXEC_SUCCESS;
1149
26.3k
}
1150
1151
void ccv_nnc_graph_cancel(ccv_nnc_graph_t* const graph)
1152
0
{
1153
0
  __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_CANCEL, __ATOMIC_RELEASE);
1154
0
}