Coverage Report

Created: 2019-07-03 22:50

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_graph_run.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_graph.h"
#include "_ccv_nnc_stream.h"
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#endif

#pragma mark - Level-2 API

static void _ccv_nnc_unwrap_tensor_wrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
{
  ccv_nnc_tensor_t* tensor = tensor_wrap->tensors[tensor_wrap->index];
  while (CCV_IS_TENSOR_MULTIVIEW(tensor) &&
       (((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph ||
        ((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph->peer))
  {
    // If the anchor is from the peer, we use the reverse_count instead (we are looking it up).
    const int i = (int)((((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph) ? count : reverse_count);
    ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
    const int off = mv->kind;
    const int mod = mv->repeat;
    tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i >= off ? ((i - off) % mod) + off : i]; // Unwrap.
    // If reached the root.
    if (!CCV_IS_TENSOR_MULTIVIEW(tensor))
      tensor_wrap->update_required = 1; // Need to update tensor updates.
    ++tensor_wrap->index;
    tensor_wrap->tensors[tensor_wrap->index] = tensor;
    assert(tensor_wrap->index < tensor_wrap->count);
  }
}
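
The slot selection in the unwrap loop above treats the first `kind` entries of a multi-view as a fixed prefix and the remaining `repeat` entries as a ring buffer. A minimal standalone sketch of that selection rule (illustrative only; the helper name and the toy values are not part of the library):

#include <stdio.h>

/* Hypothetical helper mirroring the index expression in _ccv_nnc_unwrap_tensor_wrap:
 * the first `off` (mv->kind) slots are addressed directly, everything past that
 * wraps around a ring of `mod` (mv->repeat) slots. */
static int multiview_slot(const int i, const int off, const int mod)
{
  return i >= off ? ((i - off) % mod) + off : i;
}

int main(void)
{
  int count;
  /* With off = 1 and mod = 2, loop counts 0, 1, 2, 3, 4 map to slots 0, 1, 2, 1, 2. */
  for (count = 0; count < 5; count++)
    printf("count %d -> slot %d\n", count, multiview_slot(count, 1, 2));
  return 0;
}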

static void _ccv_nnc_graph_unwrap_sub_graph(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, const ccv_nnc_graph_t* const sub_graph)
{
  int i;
  if (sub_graph->carry_overs)
    for (i = 0; i < sub_graph->carry_overs->rnum; i++)
    {
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->from);
      _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->to);
    }
  if (sub_graph->sub_graphs)
    for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
      _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
}

static void _ccv_nnc_graph_unwrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count)
{
  if (!graph->tensor_wraps_refs)
    return;
  int i, j;
  for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
  {
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
    if (tensor_wrap_array)
      for (j = 0; j < tensor_wrap_array->size; j++)
      {
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
        if (!tensor_wrap)
          continue;
        _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, tensor_wrap);
      }
  }
  _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, graph);
}

static void _ccv_nnc_graph_transit_move_to(const ccv_nnc_graph_t* const graph)
{
  int i;
  if (graph->carry_overs)
    for (i = 0; i < graph->carry_overs->rnum; i++)
    {
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->to->tensors[carry_over->to->index]);
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
      it->data = carry_over->transit;
    }
}

static void _ccv_nnc_graph_from_move_transit(const ccv_nnc_graph_t* const graph)
{
  int i;
  if (graph->carry_overs)
    for (i = 0; i < graph->carry_overs->rnum; i++)
    {
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i);
      ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->from->tensors[carry_over->from->index]);
      assert(!CCV_IS_TENSOR_MULTIVIEW(it));
      carry_over->transit = it->data;
    }
}
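
The two helpers above implement the carry-over handshake: at the end of an iteration, _ccv_nnc_graph_from_move_transit remembers the data pointer of the `from` tensor, and at the start of the next iteration _ccv_nnc_graph_transit_move_to installs it into the `to` tensor. A toy sketch of that handshake with stand-in types (nothing below is library code):

#include <stdio.h>

typedef struct { float* data; } toy_tensor_t;
typedef struct { toy_tensor_t* from; toy_tensor_t* to; float* transit; } toy_carry_over_t;

/* End of iteration t: remember where `from` currently points. */
static void from_move_transit(toy_carry_over_t* const c) { c->transit = c->from->data; }
/* Start of iteration t + 1: make `to` point at the remembered buffer. */
static void transit_move_to(toy_carry_over_t* const c) { c->to->data = c->transit; }

int main(void)
{
  float a = 1, b = 2;
  toy_tensor_t from = { &a }, to = { &b };
  toy_carry_over_t carry = { &from, &to, 0 };
  from_move_transit(&carry);
  transit_move_to(&carry);
  printf("%g\n", *to.data); /* prints 1: `to` now reads what `from` produced */
  return 0;
}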

static void _ccv_nnc_rewrap_tensor_wrap(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
{
  while (tensor_wrap->index > 0 && CCV_IS_TENSOR_MULTIVIEW(tensor_wrap->tensors[tensor_wrap->index - 1]) &&
      (((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph ||
       ((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph->peer))
    --tensor_wrap->index;
}

static void _ccv_nnc_graph_rewrap_sub_graph(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_t* const sub_graph)
{
  int i;
  if (sub_graph->carry_overs)
    for (i = 0; i < sub_graph->carry_overs->rnum; i++)
    {
      ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i);
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->from);
      _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->to);
    }
  if (sub_graph->sub_graphs)
    for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
      _ccv_nnc_graph_rewrap_sub_graph(graph, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i));
}

static void _ccv_nnc_graph_rewrap(const ccv_nnc_graph_t* const graph) // Call this method at the end to roll the wrap_ptr back
{
  if (!graph->tensor_wraps_refs)
    return;
  int i, j;
  for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
  {
    const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i);
    const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph;
    ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d);
    if (tensor_wrap_array)
      for (j = 0; j < tensor_wrap_array->size; j++)
      {
        ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j];
        if (!tensor_wrap)
          continue;
        _ccv_nnc_rewrap_tensor_wrap(graph, tensor_wrap);
      }
  }
  _ccv_nnc_graph_rewrap_sub_graph(graph, graph);
}

static void _ccv_nnc_graph_exec_unwrap_io(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
{
  if (!node->tensor_wraps_ref)
    return;
  int i;
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
  for (i = 0; i < tensor_wrap_array->size; i++)
    if (tensor_wraps[i])
    {
      assert(tensor_wraps[i]->index > 0);
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
      // Only now set the mv->it, because now this node is about to get executed.
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
    }
  for (i = 0; i < node->input_size; i++)
    if (tensor_wraps[i])
      node->inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
  const int d = node->input_size;
  for (i = 0; i < node->output_size; i++)
    if (tensor_wraps[d + i])
      node->outputs[i] = tensor_wraps[d + i]->tensors[tensor_wraps[d + i]->index];
}

static void _ccv_nnc_graph_exec_unwrap_while_expr(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
{
  assert(node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE);
  if (!node->p_while.tensor_wraps_ref)
    return;
  int i;
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->p_while.tensor_wraps_ref - 1);
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
  for (i = 0; i < tensor_wrap_array->size; i++)
    if (tensor_wraps[i])
    {
      assert(tensor_wraps[i]->index > 0);
      ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
      assert(CCV_IS_TENSOR_MULTIVIEW(mv));
      // Only now set the mv->it, because now this node is about to get executed.
      mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
      assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it));
    }
  for (i = 0; i < node->p_while.input_size; i++)
    if (tensor_wraps[i])
      node->p_while.inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index];
}

static void _ccv_nnc_graph_exec_unwrap_phi(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_info_t* const node, const int ref)
{
  int i;
  // If the output tensor is a phi multi-view tensor, we update our selection to all the subscribers.
  for (i = 0; i < node->output_size; i++)
    if (CCV_IS_TENSOR_MULTIVIEW(node->outputs[i]) &&
      ((ccv_nnc_tensor_multiview_t*)node->outputs[i])->anchor == CCV_NNC_MULTIVIEW_PHI)
    {
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)node->outputs[i];
      mv->it = CCV_NNC_MULTIVIEW_DATA(mv)[ref >= 0];
      ccv_nnc_tensor_multiview_synchronize(mv);
    }
}
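
The phi update above indexes the multi-view with the boolean `ref >= 0`: slot 0 holds the fall-through value used when no branch ran, slot 1 the value produced by whichever branch did run. A tiny illustration of that indexing idiom (toy values, not library code):

#include <stdio.h>

int main(void)
{
  const char* const data[2] = { "fall-through value", "branch output" };
  int ref;
  for (ref = -1; ref <= 1; ref++)
    printf("ref = %2d -> %s\n", ref, data[ref >= 0]); /* ref < 0 picks slot 0 */
  return 0;
}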

static void _ccv_nnc_graph_exec_begin_synchronize_multiviews(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node)
{
  if (!node->tensor_wraps_ref)
    return;
  int i;
  ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1);
  ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps;
  for (i = 0; i < tensor_wrap_array->size; i++)
    if (tensor_wraps[i] && tensor_wraps[i]->update_required)
    {
      assert(tensor_wraps[i]->index > 0);
      ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]);
      // Now update the final pointer.
      ccv_nnc_tensor_multiview_synchronize(mv);
      tensor_wraps[i]->update_required = 0; // Reset, no need to update.
    }
}

static void _ccv_nnc_print_tensor_verbose(const ccv_nnc_tensor_t* const tensor)
{
  if (tensor->info.dim[0] <= 0)
    return;
  int i;
  const int len = ccv_min(tensor->info.dim[0], 3);
  if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_GPU_MEMORY)
  {
#ifdef HAVE_CUDA
    switch (tensor->info.datatype)
    {
      case CCV_16F: {
        uint16_t data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t));
        float fp32[len];
        ccv_half_precision_to_float(data, fp32, len);
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
        break;
      }
      case CCV_32F: {
        float data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->info.type, len * sizeof(float));
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
        break;
      }
      case CCV_64F: {
        double data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->info.type, len * sizeof(double));
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", data[i]);
        break;
      }
      case CCV_32S: {
        int data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->info.type, len * sizeof(int));
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %d", data[i]);
        break;
      }
      case CCV_64S: {
        int64_t data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->info.type, len * sizeof(int64_t));
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]);
        break;
      }
      case CCV_8U: {
        uint8_t data[len];
        cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->info.type, len * sizeof(uint8_t));
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]);
        break;
      }
    }
    if (ccv_nnc_tensor_count(tensor->info) > 3)
      PRINT(CCV_CLI_VERBOSE, " ..");
#endif
  } else if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_CPU_MEMORY) {
    switch (tensor->info.datatype)
    {
      case CCV_16F: {
        float fp32[len];
        ccv_half_precision_to_float((uint16_t*)tensor->data.f16, fp32, len);
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]);
        break;
      }
      case CCV_32F:
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f32[i]);
        break;
      case CCV_64F:
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f64[i]);
        break;
      case CCV_32S:
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %d", tensor->data.i32[i]);
        break;
      case CCV_64S:
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %lld", (long long)tensor->data.i64[i]);
        break;
      case CCV_8U:
        for (i = 0; i < len; i++)
          PRINT(CCV_CLI_VERBOSE, " %d", (int)tensor->data.u8[i]);
        break;
    }
    if (ccv_nnc_tensor_count(tensor->info) > 3)
      PRINT(CCV_CLI_VERBOSE, " ..");
  }
}

typedef struct {
  ccv_nnc_graph_t* graph;
  int exec_idx;
  ccv_nnc_graph_exec_info_t* exec;
  ccv_nnc_tensor_tape_t* tensor_tape;
  ccv_nnc_stream_context_t* stream_context;
  int flags;
} ccv_nnc_graph_topsorted_run_coro_t;

static void _ccv_nnc_graph_topsorted_run_coro(ccv_nnc_stream_task_t* const self, void* const userdata);

typedef struct {
  ccv_nnc_graph_t* graph;
  int exec_idx;
  ccv_nnc_graph_exec_info_t* exec;
  ccv_nnc_tensor_t* const* inputs;
  ccv_nnc_tensor_tape_t* tensor_tape;
  ccv_nnc_stream_context_t* stream_context;
  int flags;
} ccv_nnc_graph_exec_cases_of_coro_t;

static void _ccv_nnc_graph_exec_cases_of_coro(ccv_nnc_stream_task_t* const self, void* const userdata)
{
  const ccv_nnc_graph_exec_cases_of_coro_t* const params = (ccv_nnc_graph_exec_cases_of_coro_t*)userdata;
  ccv_nnc_graph_t* const graph = params->graph;
  const int exec_idx = params->exec_idx;
  ccv_nnc_graph_exec_info_t* const exec = params->exec;
  ccv_nnc_tensor_t* const* const inputs = params->inputs;
  ccv_nnc_tensor_tape_t* const tensor_tape = params->tensor_tape;
  ccv_nnc_stream_context_t* const stream_context = params->stream_context;
  const int flags = params->flags;
  // Wait until this stream context is done.
  ccv_nnc_stream_task_synchronize(self, stream_context);
  int ref;
  if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
  {
    ref = exec->case_of.offset + exec->case_of.expr(inputs, exec->input_size, exec->case_of.data);
    if (tensor_tape)
      ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
        .d = exec_idx,
        .graph = graph,
      }, ref);
  } else {
    assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
    assert(tensor_tape);
    ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
        .d = exec_idx,
        .graph = graph,
      });
  }
  if (ref >= 0)
  {
    assert(ref < exec->graph_ref_size);
    ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(exec)[ref] - 1);
    assert(exec->schedule.stream_size == 1);
    assert(graph->streams[SCHEDULE_STREAMS(exec->schedule)[0]] == sub_graph->streams[0]);
    ccv_nnc_graph_topsorted_run_coro_t params = {
      .graph = sub_graph,
      .exec_idx = exec_idx,
      .exec = exec,
      .tensor_tape = tensor_tape,
      .stream_context = graph->streams[SCHEDULE_STREAMS(exec->schedule)[0]],
      .flags = flags
    };
    // Directly call it.
    _ccv_nnc_graph_topsorted_run_coro(self, &params);
  }
  _ccv_nnc_graph_exec_unwrap_phi(graph, exec, ref);
}

typedef struct {
  ccv_nnc_graph_t* graph;
  ccv_nnc_graph_exec_info_t* node;
  ccv_nnc_stream_context_t* stream;
} ccv_nnc_graph_neighbor_context_discovery_t;

static ccv_nnc_stream_context_t* _ccv_nnc_graph_neighbor_context_discovery(const int device_id, void* const context)
{
  const ccv_nnc_graph_neighbor_context_discovery_t* const discovery = (ccv_nnc_graph_neighbor_context_discovery_t*)context;
  if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(discovery->stream)) == device_id)
    return discovery->stream;
  ccv_nnc_graph_t* const graph = discovery->graph;
  ccv_nnc_graph_exec_info_t* const node = discovery->node;
  int i;
  // First try to find in other streams of the same node.
  for (i = 0; i < node->schedule.stream_size; i++)
  {
    ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(node->schedule)[i]];
    if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream)) == device_id)
      return stream;
  }
  // If we cannot find one there, try all the wait streams.
  for (i = 0; i < node->schedule.wait_size; i++)
  {
    ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_signal_get_emitter(graph->signals[node->schedule.waits[i]]);
    if (stream_context && CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream_context)) == device_id)
      return stream_context;
  }
  return 0;
}

static inline ccv_nnc_stream_task_t* _ccv_nnc_graph_exec_run_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_scheduler_t* const scheduler, const int flags)
{
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
  ccv_nnc_tensor_t** inputs = node->inputs;
  ccv_nnc_tensor_t** outputs = inputs + node->input_size;
  if (tensor_tape)
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
  /* Broadcast the updates to all subscribed references for input / output. Even though at this
   * time the output is not written yet, propagating the pointer change is still valid. */
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
  {
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
    {
      ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(node->schedule)[0]];
      ccv_nnc_graph_exec_cases_of_coro_t params = {
        .graph = graph,
        .exec_idx = idx,
        .exec = node,
        .inputs = inputs,
        .tensor_tape = tensor_tape,
        .stream_context = node_stream,
        .flags = flags,
      };
      ccv_nnc_stream_task_t* const task = ccv_nnc_stream_task_new(scheduler, _ccv_nnc_graph_exec_cases_of_coro, &params, 0);
      ccv_nnc_stream_task_resume(task);
      return task;
    } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
      assert(graph->streams[SCHEDULE_STREAMS(node->schedule)[0]] == sub_graph->streams[0]);
      ccv_nnc_graph_topsorted_run_coro_t params = {
        .graph = sub_graph,
        .exec_idx = idx,
        .exec = node,
        .tensor_tape = tensor_tape,
        .stream_context = graph->streams[SCHEDULE_STREAMS(node->schedule)[0]],
        .flags = flags
      };
      ccv_nnc_stream_task_t* const task = ccv_nnc_stream_task_new(scheduler, _ccv_nnc_graph_topsorted_run_coro, &params, 0);
      ccv_nnc_stream_task_resume(task);
      return task;
    }
  } else {
    int i, j;
    for (i = 0; i < node->schedule.stream_size; i++)
    {
      ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(node->schedule)[i]];
      for (j = 0; j < node->schedule.wait_size; j++)
        ccv_nnc_stream_context_wait_signal(stream, graph->signals[node->schedule.waits[j]]);
    }
    PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size);
    for (i = 0; i < node->input_size; i++)
    {
      PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
      if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE))
        _ccv_nnc_print_tensor_verbose(inputs[i]);
      PRINT(CCV_CLI_VERBOSE, "\n");
    }
    ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(node->schedule)[0]];
    ccv_nnc_graph_neighbor_context_discovery_t discovery_context = {
      .graph = graph,
      .node = node,
      .stream = node_stream
    };
    ccv_nnc_stream_context_set_neighbor_discovery(node_stream, _ccv_nnc_graph_neighbor_context_discovery, &discovery_context);
    ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, node_stream);
    for (i = 0; i < node->output_size; i++)
    {
      PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
      if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE))
        _ccv_nnc_print_tensor_verbose(outputs[i]);
      PRINT(CCV_CLI_VERBOSE, "\n");
    }
    for (i = 0; i < node->schedule.stream_size; i++)
      if (SCHEDULE_SIGNALS(node->schedule)[i] >= 0)
      {
        ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(node->schedule)[i]];
        ccv_nnc_stream_context_emit_signal(stream, graph->signals[SCHEDULE_SIGNALS(node->schedule)[i]]);
      }
  }
  return 0;
}

static void _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, ccv_nnc_graph_exec_info_t* const node, ccv_nnc_stream_task_t* const task)
{
  int i, j;
  if (node->outgoings)
    for (i = 0; i < node->outgoings->rnum; i++)
    {
      const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i);
      ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx;
      // An outgoing stream can be blocked by multiple other tasks from other streams. That is OK,
      // because on the next round of execution, that one will be marked as blocked again.
      for (j = 0; j < outgoing_node->schedule.stream_size; j++)
        graph->block_stream_tasks[SCHEDULE_STREAMS(outgoing_node->schedule)[j]] = task;
    }
}

static void _ccv_nnc_graph_wait_any_sub_tasks(ccv_nnc_stream_task_t* const self, ccv_nnc_graph_t* const graph, ccv_nnc_stream_task_t* const* const sub_tasks, const int sub_task_size, ccv_nnc_graph_exec_info_t* const exec_info, const int* const pending_nodes, const int pending_node_size)
{
  int i, j, k;
  if (sub_task_size)
    ccv_nnc_stream_task_wait_any(self, sub_tasks, sub_task_size);
  for (i = 0; i < sub_task_size; i++)
    if (sub_tasks[i]->done)
      for (j = 0; j < pending_node_size; j++)
      {
        ccv_nnc_graph_exec_info_t* const node = exec_info + pending_nodes[j];
        for (k = 0; k < node->schedule.stream_size; k++)
          if (graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[k]] == sub_tasks[i])
            graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[k]] = 0;
      }
}

static void _ccv_nnc_graph_exec_run_loop(ccv_nnc_stream_task_t* const self, ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, const int start_index, const int exec_info_size, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags)
{
  int i, j;
  int sub_task_size = 0;
  ccv_nnc_stream_task_t** const sub_tasks = (ccv_nnc_stream_task_t**)ccv_nnc_graph_buffer(graph, sizeof(ccv_nnc_stream_task_t*) * (graph->sub_graphs ? graph->sub_graphs->rnum : 0) + sizeof(int) * exec_info_size * 2);
  int* pending_nodes[2];
  pending_nodes[0] = (int*)(sub_tasks + (graph->sub_graphs ? graph->sub_graphs->rnum : 0));
  pending_nodes[1] = pending_nodes[0] + exec_info_size;
  int pending_node_size[2] = {
    0, 0
  };
  for (i = start_index; i < exec_info_size; i++)
  {
    ccv_nnc_graph_exec_info_t* const node = exec_info + i;
    // If the stream is blocked, but not by the currently executing task, defer this node.
    int blocked = 0;
    for (j = 0; j < node->schedule.stream_size; j++)
      if (graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]])
      {
        pending_nodes[0][pending_node_size[0]++] = i;
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(graph, exec_info, node, graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]]);
        blocked = 1;
      }
    if (blocked)
      continue;
    ccv_nnc_stream_task_t* const task = _ccv_nnc_graph_exec_run_task(graph, node, i, tensor_tape, self->super, flags);
    if (task && !task->done)
    {
      sub_tasks[sub_task_size++] = task;
      for (j = 0; j < node->schedule.stream_size; j++)
        graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]] = task;
      _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(graph, exec_info, node, task);
    }
  }
  _ccv_nnc_graph_wait_any_sub_tasks(self, graph, sub_tasks, sub_task_size, exec_info, pending_nodes[0], pending_node_size[0]);
  int p = 0, q = 1;
  while (pending_node_size[p] > 0)
  {
    pending_node_size[q] = 0;
    sub_task_size = 0;
    for (i = 0; i < pending_node_size[p]; i++)
    {
      const int idx = pending_nodes[p][i];
      ccv_nnc_graph_exec_info_t* const node = exec_info + idx;
      int blocked = 0;
      for (j = 0; j < node->schedule.stream_size; j++)
        if (graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]])
        {
          _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(graph, exec_info, node, graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]]);
          pending_nodes[q][pending_node_size[q]++] = idx;
          blocked = 1;
        }
      if (blocked)
        continue;
      ccv_nnc_stream_task_t* const task = _ccv_nnc_graph_exec_run_task(graph, node, idx, tensor_tape, self->super, flags);
      if (task && !task->done)
      {
        sub_tasks[sub_task_size++] = task;
        for (j = 0; j < node->schedule.stream_size; j++)
          graph->block_stream_tasks[SCHEDULE_STREAMS(node->schedule)[j]] = task;
        _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(graph, exec_info, node, task);
      }
    }
    int t;
    CCV_SWAP(p, q, t);
    _ccv_nnc_graph_wait_any_sub_tasks(self, graph, sub_tasks, sub_task_size, exec_info, pending_nodes[p], pending_node_size[p]);
  }
}
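
_ccv_nnc_graph_exec_run_loop defers any node whose stream is still blocked by an unfinished sub-task into pending_nodes[0], waits for some sub-task to finish, then retries the deferred nodes, ping-ponging between the two pending lists until both drain. A standalone sketch of that two-list retry pattern (the is_blocked predicate and the item numbers are made up for illustration):

#include <stdio.h>

#define N 6

/* Toy predicate: odd items are blocked in the first round only. */
static int is_blocked(const int item, const int round)
{
  return item % 2 == 1 && round == 0;
}

int main(void)
{
  int pending[2][N], pending_size[2] = { 0, 0 };
  int i, p = 0, q = 1, round = 0;
  for (i = 0; i < N; i++)
    if (is_blocked(i, round))
      pending[p][pending_size[p]++] = i; /* defer for a later round */
    else
      printf("run %d\n", i);
  while (pending_size[p] > 0)
  {
    ++round;
    pending_size[q] = 0;
    for (i = 0; i < pending_size[p]; i++)
      if (is_blocked(pending[p][i], round))
        pending[q][pending_size[q]++] = pending[p][i]; /* still blocked, retry later */
      else
        printf("run %d\n", pending[p][i]);
    int t = p; p = q; q = t; /* swap the two pending lists */
  }
  return 0;
}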

static void _ccv_nnc_graph_topsorted_run_coro(ccv_nnc_stream_task_t* const self, void* const userdata)
{
  const ccv_nnc_graph_topsorted_run_coro_t* const params = (ccv_nnc_graph_topsorted_run_coro_t*)userdata;
  ccv_nnc_graph_t* const graph = params->graph;
  const int exec_idx = params->exec_idx;
  ccv_nnc_graph_exec_info_t* const exec = params->exec;
  ccv_nnc_tensor_tape_t* const tensor_tape = params->tensor_tape;
  ccv_nnc_stream_context_t* const stream_context = params->stream_context;
  const int flags = params->flags;
  int i, j;
  // Assign the resource container pointer.
  for (i = 0; i < graph->stream_size; i++)
    graph->streams[i]->resource_container = stream_context->_inline_container;
  ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0);
  if (exec_idx == -1)
  {
    if (stream_context->main)
    {
      ccv_nnc_stream_task_t* const previous_main = stream_context->main;
      stream_context->main = self;
      // Wait for the previous task to be done. This makes sure that our graph run is serial on the same stream.
      assert(!previous_main->done);
      ccv_nnc_stream_task_wait_any(self, &previous_main, 1);
    } else
      stream_context->main = self;
    if (stream_context != graph->streams[0])
    {
      // Make sure that when we start work on streams[0], the current stream context is done.
      ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_get_signal(stream_context, (int64_t)(intptr_t)graph);
      ccv_nnc_stream_context_emit_signal(stream_context, signal);
      ccv_nnc_stream_context_wait_signal(graph->streams[0], signal);
    }
  } else {
    assert(stream_context == graph->streams[0]);
  }
  if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
  {
    assert(exec->p_while.expr);
    int64_t count = 0;
    // This is a forward while loop. A backward while loop will just consult its peering part.
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
    {
      const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size;
      for (;; ++count)
      {
        graph->while_count = count;
        if (tensor_tape)
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
            .d = exec_idx,
            .graph = graph->p,
          }, count);
        _ccv_nnc_graph_unwrap(graph, count, 0);
        if (count > 0)
          _ccv_nnc_graph_transit_move_to(graph);
        _ccv_nnc_graph_exec_run_loop(self, graph, exec_info, 0, graph_breakpoint_size, tensor_tape, flags);
        // Reached the breakpoints; now check the breakpoint expression, and if it is not met, break out.
        // Wait until everything on the stream is executed.
        for (i = graph->breakpoint_offset; i < graph_breakpoint_size; i++)
          for (j = 0; j < exec_info[i].schedule.stream_size; j++)
            ccv_nnc_stream_task_synchronize(self, graph->streams[SCHEDULE_STREAMS(exec_info[i].schedule)[j]]);
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
        {
          _ccv_nnc_graph_rewrap(graph);
          // It is OK to break from here because all the streams have been waited on.
          break;
        }
        _ccv_nnc_graph_exec_run_loop(self, graph, exec_info, graph_breakpoint_size, graph->exec_info->rnum, tensor_tape, flags);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
    } else {
      // For a backward graph, there is no need to evaluate the while expr.
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
      assert(graph->peer);
      assert(tensor_tape);
      count = 0;
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
          .d = exec_idx,
          .graph = graph->p,
        });
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
      _ccv_nnc_graph_exec_run_loop(self, graph, exec_info, graph->breakpoint_offset, graph->exec_info->rnum, tensor_tape, flags);
      _ccv_nnc_graph_from_move_transit(graph);
      _ccv_nnc_graph_rewrap(graph);
      for (count = 1; reverse_count > 0; ++count)
      {
        graph->while_count = --reverse_count;
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
        _ccv_nnc_graph_transit_move_to(graph);
        _ccv_nnc_graph_exec_run_loop(self, graph, exec_info, 0, graph->exec_info->rnum, tensor_tape, flags);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
    }
    for (i = 0; i < graph->wait_size; i++)
      ccv_nnc_stream_context_wait_signal(graph->streams[0], graph->signals[graph->waits[i]]);
  } else {
    graph->while_count = 0;
    _ccv_nnc_graph_exec_run_loop(self, graph, exec_info, 0, graph->exec_info->rnum, tensor_tape, flags);
    for (i = 0; i < graph->wait_size; i++)
      ccv_nnc_stream_context_wait_signal(graph->streams[0], graph->signals[graph->waits[i]]);
  }
  if (stream_context != graph->streams[0])
  {
    assert(exec_idx == -1);
    ccv_nnc_stream_context_emit_signal(graph->streams[0], graph->extern_signal);
    ccv_nnc_stream_context_wait_signal(stream_context, graph->extern_signal);
  }
  // Reset main to 0 if it is currently us.
  if (exec_idx == -1 && stream_context->main == self)
    stream_context->main = 0;
}

static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size);

static inline void _ccv_nnc_graph_exec_run(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)
{
  int i;
  _ccv_nnc_graph_exec_unwrap_io(graph, node);
  ccv_nnc_tensor_t** inputs = node->inputs;
  ccv_nnc_tensor_t** outputs = inputs + node->input_size;
  if (tensor_tape)
    ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size);
  /* Broadcast the updates to all subscribed references for input / output. Even though at this
   * time the output is not written yet, propagating the pointer change is still valid. */
  _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node);
  if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
  {
    assert(!stream_context); // This doesn't work properly with a stream context.
    if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF)
    {
      int ref;
      if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
      {
        ref = node->case_of.offset + node->case_of.expr(inputs, node->input_size, node->case_of.data);
        if (tensor_tape)
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
            .d = idx,
            .graph = graph,
          }, ref);
      } else {
        assert(node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
        assert(tensor_tape);
        ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){
            .d = idx,
            .graph = graph,
          });
      }
      if (ref >= 0)
      {
        assert(ref < node->graph_ref_size);
        ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[ref] - 1);
        _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, tensor_tape, stream_context, flags, 0, 0, 0, 0);
      }
      _ccv_nnc_graph_exec_unwrap_phi(graph, node, ref);
    } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
      ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1);
      _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, tensor_tape, stream_context, flags, 0, 0, 0, 0);
    }
  } else {
    PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size);
    for (i = 0; i < node->input_size; i++)
    {
      PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
      if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE))
        _ccv_nnc_print_tensor_verbose(inputs[i]);
      PRINT(CCV_CLI_VERBOSE, "\n");
    }
    ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, stream_context);
    for (i = 0; i < node->output_size; i++)
    {
      PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
      if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE))
        _ccv_nnc_print_tensor_verbose(outputs[i]);
      PRINT(CCV_CLI_VERBOSE, "\n");
    }
  }
}

static inline void _ccv_nnc_graph_topsorted_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)
{
  int i;
  if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
  {
    assert(!stream_context); // This doesn't work properly with a stream context.
    assert(exec->p_while.expr);
    int64_t count = 0;
    // This is a forward while loop. A backward while loop will just consult its peering part.
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
    {
      const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size;
      for (;; ++count)
      {
        graph->while_count = count;
        if (tensor_tape)
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
            .d = exec_idx,
            .graph = graph->p,
          }, count);
        _ccv_nnc_graph_unwrap(graph, count, 0);
        if (count > 0)
          _ccv_nnc_graph_transit_move_to(graph);
        for (i = 0; i < graph_breakpoint_size; i++)
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
        // Reached the breakpoints; now check the breakpoint expression, and if it is not met, break out.
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
        {
          _ccv_nnc_graph_rewrap(graph);
          break;
        }
        for (i = graph_breakpoint_size; i < graph->exec_info->rnum; i++)
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
    } else {
      // For a backward graph, there is no need to evaluate the while expr.
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
      assert(graph->peer);
      assert(tensor_tape);
      count = 0;
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
          .d = exec_idx,
          .graph = graph->p,
        });
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
      for (i = graph->breakpoint_offset; i < graph->exec_info->rnum; i++)
        _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
      _ccv_nnc_graph_from_move_transit(graph);
      _ccv_nnc_graph_rewrap(graph);
      for (count = 1; reverse_count > 0; ++count)
      {
        graph->while_count = --reverse_count;
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
        _ccv_nnc_graph_transit_move_to(graph);
        for (i = 0; i < graph->exec_info->rnum; i++)
          _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
    }
  } else {
    graph->while_count = 0;
    for (i = 0; i < graph->exec_info->rnum; i++)
      _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags);
  }
}
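
The forward branch of _ccv_nnc_graph_topsorted_run runs the nodes up to the breakpoint, evaluates the while expression, and only executes the remainder of the body (plus the transit bookkeeping) when the expression still holds. A toy mirror of just that control flow, with stand-in names and a fixed predicate (not library code):

#include <stdio.h>

static int while_expr(const long count) { return count < 3; } /* stand-in for exec->p_while.expr */

int main(void)
{
  long count;
  for (count = 0;; ++count)
  {
    printf("iteration %ld: run nodes before the breakpoint\n", count);
    if (!while_expr(count))
      break; /* the expression failed at the breakpoint: leave the loop */
    printf("iteration %ld: run the rest of the body\n", count);
  }
  printf("left the loop after %ld full iterations\n", count);
  return 0;
}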

static inline void _ccv_nnc_graph_run_slow_path(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
  int i, j;
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
  const int graph_source_size = source_size ? source_size : graph->sources->rnum;
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
  const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
#define visitor(node, idx, ...) \
  _ccv_nnc_graph_exec_run(graph, node, idx, tensor_tape, stream_context, flags)
  if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
  {
    assert(!stream_context); // This doesn't work properly with a stream context.
    assert(exec->p_while.expr);
    int64_t count = 0;
    // This is a forward while loop. A backward while loop will just consult its peering part.
    if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD)
    {
      ccv_array_t* follows = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), graph->breakpoint_size, 0);
      for (i = 0; i < graph->breakpoint_size; i++)
      {
        const ccv_nnc_graph_exec_info_t* const exec_info = (const ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, graph->breakpoints[i].d);
        if (exec_info->outgoings)
          for (j = 0; j < exec_info->outgoings->rnum; j++)
          {
            const ccv_nnc_graph_exec_t exec = {
              .d = *(int*)ccv_array_get(exec_info->outgoings, j),
              .graph = graph,
            };
            ccv_array_push(follows, &exec);
          }
      }
      for (;; ++count)
      {
        graph->while_count = count;
        if (tensor_tape)
          ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
            .d = exec_idx,
            .graph = graph->p,
          }, count);
        _ccv_nnc_graph_unwrap(graph, count, 0);
        if (count > 0)
          _ccv_nnc_graph_transit_move_to(graph);
        CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph->breakpoints, graph->breakpoint_size, 0, visitor);
        _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec);
        // Reached the breakpoints; now check the breakpoint expression, and if it is not met, break out.
        if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data))
        {
          _ccv_nnc_graph_rewrap(graph);
          break;
        }
        if (follows->rnum > 0)
          CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(follows, 0), follows->rnum, graph_destinations, graph_destination_size, 0, visitor);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
      ccv_array_free(follows);
    } else {
      // For a backward graph, there is no need to evaluate the while expr.
      assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD);
      assert(graph->peer);
      assert(tensor_tape);
      count = 0;
      int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){
          .d = exec_idx,
          .graph = graph->p,
        });
      _ccv_nnc_graph_unwrap(graph, count, reverse_count);
      CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, graph_destinations, graph_destination_size, 1, visitor);
      _ccv_nnc_graph_from_move_transit(graph);
      _ccv_nnc_graph_rewrap(graph);
      for (count = 1; reverse_count > 0; ++count)
      {
        graph->while_count = --reverse_count;
        _ccv_nnc_graph_unwrap(graph, count, reverse_count);
        _ccv_nnc_graph_transit_move_to(graph);
        CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
        _ccv_nnc_graph_from_move_transit(graph);
        _ccv_nnc_graph_rewrap(graph);
      }
    }
  } else {
    graph->while_count = 0;
    CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
  }
#undef visitor
}

static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
  assert((sources == 0 && source_size == 0) || (sources && source_size));
  assert((destinations == 0 && destination_size == 0) || (destinations && destination_size));
  const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
  const int graph_source_size = source_size ? source_size : graph->sources->rnum;
  const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
  const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
  int i;
  for (i = 0; i < graph_source_size; i++)
    if (graph_sources[i].graph != graph)
      return CCV_NNC_EXEC_INVALID;
  for (i = 0; i < graph_destination_size; i++)
    if (graph_destinations[i].graph != graph)
      return CCV_NNC_EXEC_INVALID;
  // When topsorted is true, there is no memory allocation when running the graph.
  const int topsorted = (!sources && !destinations && graph->topsorted);
  if (topsorted)
    _ccv_nnc_graph_topsorted_run(graph, exec_idx, exec, tensor_tape, stream_context, flags);
  else
    _ccv_nnc_graph_run_slow_path(graph, exec_idx, exec, inputs, input_size, outputs, output_size, tensor_tape, stream_context, flags, sources, source_size, destinations, destination_size);
  return CCV_NNC_EXEC_SUCCESS;
}

int ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
  if (stream_context && graph->topsorted && source_size == 0 && destination_size == 0)
  {
    ccv_nnc_stream_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context);
    ccv_nnc_graph_topsorted_run_coro_t params = {
      .graph = graph,
      .exec_idx = -1,
      .exec = 0,
      .tensor_tape = tensor_tape,
      .stream_context = stream_context,
      .flags = flags
    };
    ccv_nnc_stream_task_t* const task = ccv_nnc_stream_task_new(scheduler, _ccv_nnc_graph_topsorted_run_coro, &params, sizeof(params));
    ccv_nnc_stream_schedule_task(scheduler, task);
    return CCV_NNC_EXEC_SUCCESS;
  } else
    return _ccv_nnc_graph_run(graph, -1, 0, 0, 0, 0, 0, tensor_tape, 0 /* In this case, we don't support stream context yet. */, flags, sources, source_size, destinations, destination_size);
}
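
A minimal usage sketch of the public entry point above, assuming a graph that has already been built and topsorted elsewhere (the wrapper name is hypothetical, and error handling is omitted): with no tape, no stream context and no explicit sources or destinations, the call takes the topsorted fast path of _ccv_nnc_graph_run.

#include "ccv_nnc.h"

/* Hypothetical wrapper: run the whole graph once on the default path. */
static int run_whole_graph(ccv_nnc_graph_t* const graph)
{
  return ccv_nnc_graph_run(graph, 0, 0, 0, 0, 0, 0, 0);
}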