/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_graph_run.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_nnc_graph.h" |
6 | | #include "_ccv_nnc_stream.h" |
7 | | #ifdef HAVE_CUDA |
8 | | #include "gpu/ccv_nnc_compat.h" |
9 | | #elif defined(HAVE_MPS) |
10 | | #include "mps/ccv_nnc_mps.h" |
11 | | #endif |
12 | | |
13 | | // MARK - Level-2 API |
14 | | |
15 | | static void _ccv_nnc_unwrap_tensor_wrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap) |
16 | 930 | { |
17 | 930 | ccv_nnc_tensor_t* tensor = tensor_wrap->tensors[tensor_wrap->index]; |
18 | 1.96k | while (CCV_IS_TENSOR_MULTIVIEW(tensor) && |
19 | 1.96k | (1.06k ((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph1.06k || |
20 | 1.06k | ((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph->pair45 )) |
21 | 1.03k | { |
22 | | // If the anchor is from the pair, we use the reverse_count instead (we are looking it up). |
23 | 1.03k | const int i = (int)((((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph) ? count1.02k : reverse_count15 ); |
24 | 1.03k | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor; |
25 | 1.03k | const int off = mv->kind; |
26 | 1.03k | const int mod = mv->repeat; |
27 | 1.03k | tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i >= off ? ((i - off) % mod) + off1.00k : i32 ]; // Unwrap. |
28 | | // If reached the root. |
29 | 1.03k | if (!CCV_IS_TENSOR_MULTIVIEW(tensor)) |
30 | 889 | tensor_wrap->update_required = 1; // Need to update tensor updates. |
31 | 1.03k | ++tensor_wrap->index; |
32 | 1.03k | tensor_wrap->tensors[tensor_wrap->index] = tensor; |
33 | 1.03k | assert(tensor_wrap->index < tensor_wrap->count); |
34 | 1.03k | } |
35 | 930 | } |
36 | | |
37 | | static void _ccv_nnc_graph_unwrap_sub_graph(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, const ccv_nnc_graph_t* const sub_graph) |
38 | 198 | { |
39 | 198 | int i; |
40 | 198 | if (sub_graph->carry_overs) |
41 | 265 | for (i = 0; 121 i < sub_graph->carry_overs->rnum; i++144 ) |
42 | 144 | { |
43 | 144 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i); |
44 | 144 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->from); |
45 | 144 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->to); |
46 | 144 | } |
47 | 198 | if (sub_graph->sub_graphs) |
48 | 82 | for (i = 0; 21 i < sub_graph->sub_graphs->rnum; i++61 ) |
49 | 61 | _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i)); |
50 | 198 | } |
51 | | |
52 | | static void _ccv_nnc_graph_unwrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count) |
53 | 171 | { |
54 | 171 | if (!graph->tensor_wraps_refs) |
55 | 34 | return; |
56 | 137 | int i, j; |
57 | 510 | for (i = 0; i < graph->tensor_wraps_refs->rnum; i++373 ) |
58 | 373 | { |
59 | 373 | const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i); |
60 | 373 | const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph; |
61 | 373 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d); |
62 | 373 | if (tensor_wrap_array) |
63 | 1.36k | for (j = 0; 373 j < tensor_wrap_array->size; j++994 ) |
64 | 994 | { |
65 | 994 | ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j]; |
66 | 994 | if (!tensor_wrap) |
67 | 352 | continue; |
68 | 642 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, tensor_wrap); |
69 | 642 | } |
70 | 373 | } |
71 | 137 | _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, graph); |
72 | 137 | } |
73 | | |
74 | | static void _ccv_nnc_graph_transit_move_to(const ccv_nnc_graph_t* const graph) |
75 | 141 | { |
76 | 141 | int i; |
77 | 141 | if (graph->carry_overs) |
78 | 255 | for (i = 0; 118 i < graph->carry_overs->rnum; i++137 ) |
79 | 137 | { |
80 | 137 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i); |
81 | 137 | ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->to->tensors[carry_over->to->index]); |
82 | 137 | assert(!CCV_IS_TENSOR_MULTIVIEW(it)); |
83 | 137 | it->data = carry_over->transit; |
84 | 137 | } |
85 | 141 | } |
86 | | |
87 | | static void _ccv_nnc_graph_from_move_transit(const ccv_nnc_graph_t* const graph) |
88 | 143 | { |
89 | 143 | int i; |
90 | 143 | if (graph->carry_overs) |
91 | 258 | for (i = 0; 119 i < graph->carry_overs->rnum; i++139 ) |
92 | 139 | { |
93 | 139 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i); |
94 | 139 | ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->from->tensors[carry_over->from->index]); |
95 | 139 | assert(!CCV_IS_TENSOR_MULTIVIEW(it)); |
96 | 139 | carry_over->transit = it->data; |
97 | 139 | } |
98 | 143 | } |
99 | | |
100 | | static void _ccv_nnc_rewrap_tensor_wrap(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap) |
101 | 930 | { |
102 | 1.96k | while (tensor_wrap->index > 0 && CCV_IS_TENSOR_MULTIVIEW1.18k (tensor_wrap->tensors[tensor_wrap->index - 1]) && |
103 | 1.96k | (1.18k ((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph1.18k || |
104 | 1.18k | ((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph->pair165 )) |
105 | 1.03k | --tensor_wrap->index; |
106 | 930 | } |
107 | | |
108 | | static void _ccv_nnc_graph_rewrap_sub_graph(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_t* const sub_graph) |
109 | 198 | { |
110 | 198 | int i; |
111 | 198 | if (sub_graph->carry_overs) |
112 | 265 | for (i = 0; 121 i < sub_graph->carry_overs->rnum; i++144 ) |
113 | 144 | { |
114 | 144 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i); |
115 | 144 | _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->from); |
116 | 144 | _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->to); |
117 | 144 | } |
118 | 198 | if (sub_graph->sub_graphs) |
119 | 82 | for (i = 0; 21 i < sub_graph->sub_graphs->rnum; i++61 ) |
120 | 61 | _ccv_nnc_graph_rewrap_sub_graph(graph, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i)); |
121 | 198 | } |
122 | | |
123 | | static void _ccv_nnc_graph_rewrap(const ccv_nnc_graph_t* const graph) // Call this method at the end to roll the wrap_ptr back |
124 | 171 | { |
125 | 171 | if (!graph->tensor_wraps_refs) |
126 | 34 | return; |
127 | 137 | int i, j; |
128 | 510 | for (i = 0; i < graph->tensor_wraps_refs->rnum; i++373 ) |
129 | 373 | { |
130 | 373 | const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i); |
131 | 373 | const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph; |
132 | 373 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d); |
133 | 373 | if (tensor_wrap_array) |
134 | 1.36k | for (j = 0; 373 j < tensor_wrap_array->size; j++994 ) |
135 | 994 | { |
136 | 994 | ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j]; |
137 | 994 | if (!tensor_wrap) |
138 | 352 | continue; |
139 | 642 | _ccv_nnc_rewrap_tensor_wrap(graph, tensor_wrap); |
140 | 642 | } |
141 | 373 | } |
142 | 137 | _ccv_nnc_graph_rewrap_sub_graph(graph, graph); |
143 | 137 | } |
144 | | |
145 | | static void _ccv_nnc_graph_exec_unwrap_io(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
146 | 291k | { |
147 | 291k | if (!node->tensor_wraps_ref) |
148 | 290k | return; |
149 | 277 | int i; |
150 | 277 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1); |
151 | 277 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
152 | 1.04k | for (i = 0; i < tensor_wrap_array->size; i++767 ) |
153 | 767 | if (tensor_wraps[i]) |
154 | 492 | { |
155 | 492 | assert(tensor_wraps[i]->index > 0); |
156 | 492 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
157 | 492 | assert(CCV_IS_TENSOR_MULTIVIEW(mv)); |
158 | | // Only now set the mv->it, because now this node is about to get executed. |
159 | 492 | mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
160 | 492 | assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it)); |
161 | 492 | } |
162 | 699 | for (i = 0; 277 i < node->input_size; i++422 ) |
163 | 422 | if (tensor_wraps[i]) |
164 | 191 | node->inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
165 | 277 | const int d = node->input_size; |
166 | 472 | for (i = 0; i < node->output_size; i++195 ) |
167 | 195 | if (tensor_wraps[d + i]) |
168 | 151 | node->outputs[i] = tensor_wraps[d + i]->tensors[tensor_wraps[d + i]->index]; |
169 | 277 | } |
170 | | |
171 | | static void _ccv_nnc_graph_exec_unwrap_while_expr(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
172 | 161 | { |
173 | 161 | assert(node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE); |
174 | 161 | if (!node->p_while.tensor_wraps_ref) |
175 | 155 | return; |
176 | 6 | int i; |
177 | 6 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->p_while.tensor_wraps_ref - 1); |
178 | 6 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
179 | 18 | for (i = 0; i < tensor_wrap_array->size; i++12 ) |
180 | 12 | if (tensor_wraps[i]) |
181 | 6 | { |
182 | 6 | assert(tensor_wraps[i]->index > 0); |
183 | 6 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
184 | 6 | assert(CCV_IS_TENSOR_MULTIVIEW(mv)); |
185 | | // Only now set the mv->it, because now this node is about to get executed. |
186 | 6 | mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
187 | 6 | assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it)); |
188 | 6 | } |
189 | 18 | for (i = 0; 6 i < node->p_while.input_size; i++12 ) |
190 | 12 | if (tensor_wraps[i]) |
191 | 6 | node->p_while.inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
192 | 6 | } |
193 | | |
194 | | static void _ccv_nnc_graph_exec_unwrap_phi(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_info_t* const node, const int ref) |
195 | 41 | { |
196 | 41 | int i; |
197 | | // If the output tensor is a phi multi-view tensor, we update our selection to all the subscribers. |
198 | 80 | for (i = 0; i < node->output_size; i++39 ) |
199 | 39 | if (CCV_IS_TENSOR_MULTIVIEW(node->outputs[i]) && |
200 | 39 | ((ccv_nnc_tensor_multiview_t*)node->outputs[i])->anchor == 29 CCV_NNC_MULTIVIEW_PHI29 ) |
201 | 29 | { |
202 | 29 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)node->outputs[i]; |
203 | 29 | mv->it = CCV_NNC_MULTIVIEW_DATA(mv)[ref >= 0]; |
204 | 29 | ccv_nnc_tensor_multiview_synchronize(mv); |
205 | 29 | } |
206 | 41 | } |
207 | | |
208 | | static void _ccv_nnc_graph_exec_begin_synchronize_multiviews(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
209 | 291k | { |
210 | 291k | if (!node->tensor_wraps_ref) |
211 | 290k | return; |
212 | 277 | int i; |
213 | 277 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1); |
214 | 277 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
215 | 1.04k | for (i = 0; i < tensor_wrap_array->size; i++767 ) |
216 | 767 | if (tensor_wraps[i] && tensor_wraps[i]->update_required492 ) |
217 | 492 | { |
218 | 492 | assert(tensor_wraps[i]->index > 0); |
219 | 492 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
220 | | // Now update the final pointer. |
221 | 492 | ccv_nnc_tensor_multiview_synchronize(mv); |
222 | 492 | tensor_wraps[i]->update_required = 0; // Reset, no need to update. |
223 | 492 | } |
224 | 277 | } |
225 | | |
226 | | void ccv_nnc_print_tensor_shape(const ccv_nnc_tensor_t* const tensor) |
227 | 0 | { |
228 | 0 | int i; |
229 | 0 | PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]); |
230 | 0 | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++) |
231 | 0 | PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]); |
232 | 0 | PRINT(CCV_CLI_INFO, "]"); |
233 | 0 | } |
234 | | |
235 | | void ccv_nnc_print_tensor_info(const ccv_nnc_tensor_t* const tensor) |
236 | 0 | { |
237 | 0 | int i; |
238 | 0 | PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]); |
239 | 0 | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++) |
240 | 0 | PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]); |
241 | 0 | PRINT(CCV_CLI_INFO, "]"); |
242 | 0 | if (!CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE) || tensor->info.dim[0] <= 0) |
243 | 0 | return; |
244 | 0 | const int nd = ccv_nnc_tensor_nd(tensor->info.dim); |
245 | 0 | const int len = ccv_min(tensor->info.dim[nd - 1], 3); |
246 | 0 | if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_GPU_MEMORY) |
247 | 0 | { |
248 | 0 | #ifdef HAVE_CUDA |
249 | 0 | switch (tensor->info.datatype) |
250 | 0 | { |
251 | 0 | case CCV_16F: { |
252 | 0 | uint16_t data[len]; |
253 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t)); |
254 | 0 | float fp32[len]; |
255 | 0 | ccv_half_precision_to_float(data, fp32, len); |
256 | 0 | for (i = 0; i < len; i++) |
257 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
258 | 0 | break; |
259 | 0 | } |
260 | 0 | case CCV_16BF: { |
261 | 0 | uint16_t data[len]; |
262 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t)); |
263 | 0 | float fp32[len]; |
264 | 0 | ccv_bfloat_to_float(data, fp32, len); |
265 | 0 | for (i = 0; i < len; i++) |
266 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
267 | 0 | break; |
268 | 0 | } |
269 | 0 | case CCV_32F: { |
270 | 0 | float data[len]; |
271 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->info.type, len * sizeof(float)); |
272 | 0 | for (i = 0; i < len; i++) |
273 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
274 | 0 | break; |
275 | 0 | } |
276 | 0 | case CCV_64F: { |
277 | 0 | double data[len]; |
278 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->info.type, len * sizeof(double)); |
279 | 0 | for (i = 0; i < len; i++) |
280 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
281 | 0 | break; |
282 | 0 | } |
283 | 0 | case CCV_32S: { |
284 | 0 | int data[len]; |
285 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->info.type, len * sizeof(int)); |
286 | 0 | for (i = 0; i < len; i++) |
287 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", data[i]); |
288 | 0 | break; |
289 | 0 | } |
290 | 0 | case CCV_64S: { |
291 | 0 | int64_t data[len]; |
292 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->info.type, len * sizeof(int64_t)); |
293 | 0 | for (i = 0; i < len; i++) |
294 | 0 | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]); |
295 | 0 | break; |
296 | 0 | } |
297 | 0 | case CCV_8U: { |
298 | 0 | uint8_t data[len]; |
299 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->info.type, len * sizeof(uint8_t)); |
300 | 0 | for (i = 0; i < len; i++) |
301 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]); |
302 | 0 | break; |
303 | 0 | } |
304 | 0 | } |
305 | 0 | if (ccv_nnc_tensor_count(tensor->info) > 3) |
306 | 0 | PRINT(CCV_CLI_VERBOSE, " .."); |
307 | | #elif defined(HAVE_MPS) |
308 | | switch (tensor->info.datatype) |
309 | | { |
310 | | case CCV_16F: { |
311 | | uint16_t data[len]; |
312 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->dataof, tensor->info.type, len * sizeof(uint16_t)); |
313 | | float fp32[len]; |
314 | | ccv_half_precision_to_float(data, fp32, len); |
315 | | for (i = 0; i < len; i++) |
316 | | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
317 | | break; |
318 | | } |
319 | | case CCV_16BF: { |
320 | | uint16_t data[len]; |
321 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->dataof, tensor->info.type, len * sizeof(uint16_t)); |
322 | | float fp32[len]; |
323 | | ccv_bfloat_to_float(data, fp32, len); |
324 | | for (i = 0; i < len; i++) |
325 | | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
326 | | break; |
327 | | } |
328 | | case CCV_32F: { |
329 | | float data[len]; |
330 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->dataof, tensor->info.type, len * sizeof(float)); |
331 | | for (i = 0; i < len; i++) |
332 | | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
333 | | break; |
334 | | } |
335 | | case CCV_64F: { |
336 | | double data[len]; |
337 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->dataof, tensor->info.type, len * sizeof(double)); |
338 | | for (i = 0; i < len; i++) |
339 | | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
340 | | break; |
341 | | } |
342 | | case CCV_32S: { |
343 | | int data[len]; |
344 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->dataof, tensor->info.type, len * sizeof(int)); |
345 | | for (i = 0; i < len; i++) |
346 | | PRINT(CCV_CLI_VERBOSE, " %d", data[i]); |
347 | | break; |
348 | | } |
349 | | case CCV_64S: { |
350 | | int64_t data[len]; |
351 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->dataof, tensor->info.type, len * sizeof(int64_t)); |
352 | | for (i = 0; i < len; i++) |
353 | | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]); |
354 | | break; |
355 | | } |
356 | | case CCV_8U: { |
357 | | uint8_t data[len]; |
358 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->dataof, tensor->info.type, len * sizeof(uint8_t)); |
359 | | for (i = 0; i < len; i++) |
360 | | PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]); |
361 | | break; |
362 | | } |
363 | | } |
364 | | if (ccv_nnc_tensor_count(tensor->info) > 3) |
365 | | PRINT(CCV_CLI_VERBOSE, " .."); |
366 | | #endif |
367 | 0 | } else if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_CPU_MEMORY) { |
368 | 0 | switch (tensor->info.datatype) |
369 | 0 | { |
370 | 0 | case CCV_16F: { |
371 | 0 | float fp32[len]; |
372 | 0 | ccv_half_precision_to_float((uint16_t*)tensor->data.f16, fp32, len); |
373 | 0 | for (i = 0; i < len; i++) |
374 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
375 | 0 | break; |
376 | 0 | } |
377 | 0 | case CCV_16BF: { |
378 | 0 | float fp32[len]; |
379 | 0 | ccv_bfloat_to_float((uint16_t*)tensor->data.f16, fp32, len); |
380 | 0 | for (i = 0; i < len; i++) |
381 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
382 | 0 | break; |
383 | 0 | } |
384 | 0 | case CCV_32F: |
385 | 0 | for (i = 0; i < len; i++) |
386 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f32[i]); |
387 | 0 | break; |
388 | 0 | case CCV_64F: |
389 | 0 | for (i = 0; i < len; i++) |
390 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f64[i]); |
391 | 0 | break; |
392 | 0 | case CCV_32S: |
393 | 0 | for (i = 0; i < len; i++) |
394 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", tensor->data.i32[i]); |
395 | 0 | break; |
396 | 0 | case CCV_64S: |
397 | 0 | for (i = 0; i < len; i++) |
398 | 0 | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)tensor->data.i64[i]); |
399 | 0 | break; |
400 | 0 | case CCV_8U: |
401 | 0 | for (i = 0; i < len; i++) |
402 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", (int)tensor->data.u8[i]); |
403 | 0 | break; |
404 | 0 | } |
405 | 0 | if (ccv_nnc_tensor_count(tensor->info) > 3) |
406 | 0 | PRINT(CCV_CLI_VERBOSE, " .."); |
407 | 0 | } |
408 | 0 | } |
409 | | |
410 | | static co_decl(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)); |
411 | | |
412 | 6 | static co_decl_task2 (_ccv_nnc_graph_exec_cases_of_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const ccv_nnc_graph_exec_schedule_t* const schd, ccv_nnc_tensor_t* const* const inputs, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, int flags), private( |
413 | 6 | int ref; |
414 | 6 | ccv_nnc_graph_t* sub_graph; |
415 | 6 | )) { |
416 | | // Wait until this stream context is done. |
417 | 6 | co_stream_await2 (CO_P(stream_context)); |
418 | 2 | if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
419 | 2 | { |
420 | 2 | CO_V(ref) = CO_P(exec)->case_of.offset + CO_P(exec)->case_of.expr(CO_P(inputs), CO_P(exec)->input_size, CO_P(exec)->case_of.data); |
421 | 2 | if (CO_P(tensor_tape)) |
422 | 0 | ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){ |
423 | 0 | .d = CO_P(exec_idx), |
424 | 0 | .graph = CO_P(graph), |
425 | 0 | }, CO_V(ref)); |
426 | 2 | } else { |
427 | 0 | assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
428 | 0 | assert(CO_P(tensor_tape)); |
429 | 0 | CO_V(ref) = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){ |
430 | 0 | .d = CO_P(exec_idx), |
431 | 0 | .graph = CO_P(graph), |
432 | 0 | }); |
433 | 0 | } |
434 | 2 | if (CO_V(ref) >= 0) |
435 | 2 | { |
436 | 2 | assert(CO_V(ref) < CO_P(exec)->graph_ref_size); |
437 | 2 | CO_V(sub_graph) = *(ccv_nnc_graph_t**)ccv_array_get(CO_P(graph)->sub_graphs, CCV_NNC_GRAPH_REF(CO_P(exec))[CO_V(ref)] - 1); |
438 | 2 | assert(CO_P(schd)->stream_size == 1); |
439 | 2 | assert(CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]] == CO_V(sub_graph)->streams[0]); |
440 | 2 | co_apply(_ccv_nnc_graph_topsorted_run_coro, (CO_V(sub_graph), CO_P(exec_idx), CO_V(sub_graph)->default_schedule, CO_P(exec), CO_P(tensor_tape), CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]], CO_P(flags))); |
441 | 2 | } |
442 | 2 | _ccv_nnc_graph_exec_unwrap_phi(CO_P(graph), CO_P(exec), CO_V(ref)); |
443 | 2 | } co_end() |
444 | | |
445 | | typedef struct { |
446 | | ccv_nnc_graph_t* graph; |
447 | | const ccv_nnc_graph_exec_schedule_t* node; |
448 | | ccv_nnc_stream_context_t* stream; |
449 | | } ccv_nnc_graph_neighbor_context_discovery_t; |
450 | | |
451 | | static ccv_nnc_stream_context_t* _ccv_nnc_graph_neighbor_context_discovery(const int device_id, void* const context) |
452 | 13.9k | { |
453 | 13.9k | const ccv_nnc_graph_neighbor_context_discovery_t* const discovery = (ccv_nnc_graph_neighbor_context_discovery_t*)context; |
454 | 13.9k | if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(discovery->stream)) == device_id) |
455 | 3.65k | return discovery->stream; |
456 | 10.3k | ccv_nnc_graph_t* const graph = discovery->graph; |
457 | 10.3k | const ccv_nnc_graph_exec_schedule_t* const node = discovery->node; |
458 | 10.3k | int i; |
459 | | // First try to find in other streams of the same node. |
460 | 30.9k | for (i = 0; i < node->stream_size; i++20.6k ) |
461 | 30.9k | { |
462 | 30.9k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*node)[i]]; |
463 | 30.9k | if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream)) == device_id) |
464 | 10.3k | return stream; |
465 | 30.9k | } |
466 | | // If cannot find, try to find in all the wait streams. |
467 | 7 | for (i = 0; 4 i < node->wait_size; i++3 ) |
468 | 7 | { |
469 | 7 | ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_signal_get_emitter(graph->signals[node->waits[i]]); |
470 | 7 | if (stream_context && CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream_context)) == device_id) |
471 | 4 | return stream_context; |
472 | 7 | } |
473 | 0 | return 0; |
474 | 4 | } |
475 | | |
476 | | static co_routine_t* _ccv_nnc_graph_exec_run_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const ccv_nnc_graph_exec_schedule_t* const schd, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags) |
477 | 229k | { |
478 | 229k | _ccv_nnc_graph_exec_unwrap_io(graph, node); |
479 | 229k | ccv_nnc_tensor_t** inputs = node->inputs; |
480 | 229k | ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size201k : 028.1k ; |
481 | 229k | if (tensor_tape) |
482 | 0 | ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size); |
483 | | /* Broadcast the updates to all subscribed references for input / output, even though at th |
484 | | * time output is not written yet, propagate pointer change is still valid. */ |
485 | 229k | _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node); |
486 | 229k | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD229k ) |
487 | 4 | { |
488 | 4 | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
489 | 2 | { |
490 | 2 | ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]]; |
491 | 2 | return co_new(_ccv_nnc_graph_exec_cases_of_coro, (graph, idx, node, schd, inputs, tensor_tape, node_stream, flags)); |
492 | 2 | } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) { |
493 | 2 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1); |
494 | 2 | assert(graph->streams[SCHEDULE_STREAMS(*schd)[0]] == sub_graph->streams[0]); |
495 | 2 | return co_new(_ccv_nnc_graph_topsorted_run_coro, (sub_graph, idx, sub_graph->default_schedule, node, tensor_tape, graph->streams[SCHEDULE_STREAMS(*schd)[0]], flags)); |
496 | 2 | } |
497 | 229k | } else { |
498 | 229k | PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d] (%d)\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size, 0 SCHEDULE_STREAMS0 (*schd)[0]); |
499 | 229k | int i, j; |
500 | 229k | int flag = 0; |
501 | 470k | for (i = 0; i < schd->stream_size; i++240k ) |
502 | 240k | { |
503 | 240k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]]; |
504 | 355k | for (j = 0; j < schd->wait_size; j++115k ) |
505 | 115k | { |
506 | 115k | ccv_nnc_stream_context_wait_signal(stream, graph->signals[schd->waits[j]]); |
507 | 115k | if (!flag) |
508 | 43.1k | { |
509 | 43.1k | PRINT(CCV_CLI_INFO, "Wait: (%d, %d)", SCHEDULE_STREAMS0 (*schd)[i], schd->waits[j]); |
510 | 43.1k | flag = 1; |
511 | 43.1k | } else |
512 | 71.9k | PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS0 (*schd)[i], schd->waits[j]); |
513 | 115k | } |
514 | 240k | } |
515 | 229k | if (flag) |
516 | 43.1k | PRINT(CCV_CLI_INFO, "\n"); |
517 | 880k | for (i = 0; i < node->input_size; i++650k ) |
518 | 650k | { |
519 | 650k | PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? 0 CCV_TENSOR_GET_DEVICE_ID0 (inputs[i]->info.type) : -1)); |
520 | 650k | if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS498k (CCV_CLI_INFO)) |
521 | 0 | ccv_nnc_print_tensor_info(inputs[i]); |
522 | 650k | PRINT(CCV_CLI_INFO, "\n"); |
523 | 650k | } |
524 | 586k | for (i = 0; i < node->output_size; i++356k ) |
525 | 356k | { |
526 | 356k | PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? 0 CCV_TENSOR_GET_DEVICE_ID0 (outputs[i]->info.type) : -1)); |
527 | 356k | if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS336k (CCV_CLI_INFO)) |
528 | 0 | ccv_nnc_print_tensor_shape(outputs[i]); |
529 | 356k | PRINT(CCV_CLI_INFO, "\n"); |
530 | 356k | } |
531 | 229k | ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]]; |
532 | 229k | ccv_nnc_graph_neighbor_context_discovery_t discovery_context = { |
533 | 229k | .graph = graph, |
534 | 229k | .node = schd, |
535 | 229k | .stream = node_stream |
536 | 229k | }; |
537 | 229k | ccv_nnc_stream_context_set_neighbor_discovery(node_stream, _ccv_nnc_graph_neighbor_context_discovery, &discovery_context); |
538 | 229k | const int status = ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, node_stream); |
539 | 229k | if (status != 0) |
540 | 0 | PRINT(CCV_CLI_INFO, "Invalid Status: %d\n", status); |
541 | 229k | if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE)) |
542 | 0 | { |
543 | 0 | for (i = 0; i < node->output_size; i++) |
544 | 0 | { |
545 | 0 | PRINT(CCV_CLI_VERBOSE, "POST: |<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1)); |
546 | 0 | if (outputs[i]) |
547 | 0 | ccv_nnc_print_tensor_info(outputs[i]); |
548 | 0 | PRINT(CCV_CLI_VERBOSE, "\n"); |
549 | 0 | } |
550 | 0 | } |
551 | 229k | flag = 0; |
552 | 470k | for (i = 0; i < schd->stream_size; i++240k ) |
553 | 240k | if (SCHEDULE_SIGNALS(*schd)[i] >= 0) |
554 | 57.9k | { |
555 | 57.9k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]]; |
556 | 57.9k | ccv_nnc_stream_context_emit_signal(stream, graph->signals[SCHEDULE_SIGNALS(*schd)[i]]); |
557 | 57.9k | if (!flag) |
558 | 57.9k | { |
559 | 57.9k | PRINT(CCV_CLI_INFO, "Emit: (%d, %d)", SCHEDULE_STREAMS0 (*schd)[i], SCHEDULE_SIGNALS0 (*schd)[i]); |
560 | 57.9k | flag = 1; |
561 | 57.9k | } else |
562 | 9 | PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS0 (*schd)[i], SCHEDULE_SIGNALS0 (*schd)[i]); |
563 | 57.9k | } |
564 | 229k | if (flag) |
565 | 57.9k | PRINT(CCV_CLI_INFO, "\n"); |
566 | 229k | } |
567 | 229k | return 0; |
568 | 229k | } |
569 | | |
570 | | static void _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_graph_exec_info_t* const node, co_routine_t* const task) |
571 | 6 | { |
572 | 6 | int i, j; |
573 | 6 | if (node->outgoings) |
574 | 8 | for (i = 0; 4 i < node->outgoings->rnum; i++4 ) |
575 | 4 | { |
576 | 4 | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i); |
577 | 4 | const ccv_nnc_graph_exec_schedule_t* const outgoing_schd = schd_info + outgoing_idx; |
578 | | // An outgoing stream can be blocked by multiple other tasks from other streams. But it is OK, |
579 | | // because on next round of execution, that one will be marked as blocked again. |
580 | 8 | for (j = 0; j < outgoing_schd->stream_size; j++4 ) |
581 | 4 | graph->block_stream_tasks[SCHEDULE_STREAMS(*outgoing_schd)[j]] = task; |
582 | 4 | } |
583 | 6 | } |
584 | | |
585 | 6 | static co_decl_task2 (_ccv_nnc_graph_wait_any_sub_tasks, (ccv_nnc_graph_t* const graph, co_routine_t* const* const sub_tasks, const int sub_task_size, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const pending_nodes, const int pending_node_size), private( |
586 | 6 | )) { |
587 | 6 | assert(CO_P(sub_task_size) > 0); |
588 | 2 | co_await_any(CO_P(sub_tasks), CO_P(sub_task_size)); |
589 | | // This is not good, these local variables need to be in the private section. |
590 | | // I got away with it because there is no yield or resume or apply or any after await above. |
591 | 2 | int i, j, k; |
592 | 4 | for (i = 0; i < CO_P(sub_task_size); i++2 ) |
593 | 2 | if (co_is_done(CO_P(sub_tasks)[i])) |
594 | 2 | { |
595 | 6 | for (j = 0; j < CO_P(pending_node_size); j++4 ) |
596 | 4 | { |
597 | 4 | const ccv_nnc_graph_exec_schedule_t* const node = CO_P(schd_info) + CO_P(pending_nodes)[j]; |
598 | 8 | for (k = 0; k < node->stream_size; k++4 ) |
599 | 4 | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] == CO_P(sub_tasks)[i]) |
600 | 2 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] = 0; |
601 | 4 | } |
602 | 2 | co_free(CO_P(sub_tasks)[i]); |
603 | 2 | } |
604 | 2 | } co_end() |
605 | | |
606 | 53.2k | static co_decl_task26.6k (_ccv_nnc_graph_exec_run_loop, (ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const psort, const int start_index, const int exec_info_size, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags), private( |
607 | 53.2k | int i, p, q; |
608 | 53.2k | int sub_task_size; |
609 | 53.2k | co_routine_t** sub_tasks; |
610 | 53.2k | int* pending_nodes[2]; |
611 | 53.2k | int pending_node_size[2]; |
612 | 53.2k | int idx; |
613 | 53.2k | ccv_nnc_graph_exec_info_t* node; |
614 | 53.2k | const ccv_nnc_graph_exec_schedule_t* schd; |
615 | 53.2k | co_routine_t* task; |
616 | 53.2k | )) { |
617 | 53.2k | CO_V26.6k (sub_task_size) = 0; |
618 | 53.2k | CO_V26.6k (sub_tasks) = (co_routine_t**)ccv_nnc_graph_buffer(CO_P26.6k (graph), sizeof(co_routine_t*) * (CO_P26.6k (graph)->sub_graphs26.6k ? CO_P3 (graph)->sub_graphs->rnum3 : 026.6k ) + sizeof(int) * CO_P26.6k (exec_info_size) * 2); |
619 | 53.2k | CO_V26.6k (pending_nodes)[0] = (int*)(CO_V26.6k (sub_tasks) + (CO_P26.6k (graph)->sub_graphs26.6k ? CO_P3 (graph)->sub_graphs->rnum3 : 026.6k )); |
620 | 53.2k | CO_V26.6k (pending_nodes)[1] = CO_V26.6k (pending_nodes)[0] + CO_P26.6k (exec_info_size); |
621 | 53.2k | CO_V26.6k (pending_node_size)[0] = 0; |
622 | 53.2k | CO_V26.6k (pending_node_size)[1] = 0; |
623 | 256k | for (CO_V26.6k (i) = CO_P26.6k (start_index); CO_V(i) < CO_P(exec_info_size); CO_V229k (i)++229k ) |
624 | 229k | { |
625 | 229k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
626 | 0 | break; |
627 | 229k | CO_V(idx) = CO_P(psort) ? CO_P94.7k (psort)[94.7k CO_V94.7k (i)] : CO_V135k (i); |
628 | 229k | CO_V(node) = CO_P(exec_info) + CO_V(idx); |
629 | 229k | CO_V(schd) = CO_P(schd_info) + CO_V(idx); |
630 | | // If stream is blocked by but not blocked by current executing task. |
631 | 229k | int blocked = 0, j; |
632 | 470k | for (j = 0; j < CO_V(schd)->stream_size; j++240k ) |
633 | 240k | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]) |
634 | 4 | { |
635 | 4 | CO_V(pending_nodes)[0][CO_V(pending_node_size)[0]++] = CO_V(idx); |
636 | 4 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]); |
637 | 4 | blocked = 1; |
638 | 4 | } |
639 | 229k | if (blocked) |
640 | 4 | continue; |
641 | 229k | CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags)); |
642 | 229k | if (CO_V(task)) |
643 | 4 | { |
644 | 4 | co_resume(CO_V(task)); |
645 | 4 | if (!co_is_done(CO_V(task))) |
646 | 2 | { |
647 | 2 | CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task); |
648 | 2 | int j; |
649 | 4 | for (j = 0; j < CO_V(schd)->stream_size; j++2 ) |
650 | 2 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task); |
651 | 2 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task)); |
652 | 2 | } else |
653 | 2 | co_free(CO_V(task)); |
654 | 4 | } |
655 | 229k | } |
656 | 26.6k | if (CO_V(sub_task_size)) |
657 | 26.6k | co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[0], CO_V(pending_node_size)[0])); |
658 | 26.6k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
659 | 26.6k | co_return(); |
660 | 26.6k | CO_V(p) = 0; |
661 | 26.6k | CO_V(q) = 1; |
662 | 26.6k | while (CO_V(pending_node_size)[CO_V(p)] > 0) |
663 | 2 | { |
664 | 2 | CO_V(pending_node_size)[CO_V(q)] = 0; |
665 | 2 | CO_V(sub_task_size) = 0; |
666 | 6 | for (CO_V2 (i) = 0; CO_V(i) < CO_V(pending_node_size)[CO_V(p)]; CO_V4 (i)++4 ) |
667 | 4 | { |
668 | 4 | CO_V(idx) = CO_V(pending_nodes)[CO_V(p)][CO_V(i)]; |
669 | 4 | CO_V(node) = CO_P(exec_info) + CO_V(idx); |
670 | 4 | CO_V(schd) = CO_P(schd_info) + CO_V(idx); |
671 | 4 | int blocked = 0, j; |
672 | 8 | for (j = 0; j < CO_V(schd)->stream_size; j++4 ) |
673 | 4 | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]) |
674 | 0 | { |
675 | 0 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]); |
676 | 0 | CO_V(pending_nodes)[CO_V(q)][CO_V(pending_node_size)[CO_V(q)]++] = CO_V(idx); |
677 | 0 | blocked = 1; |
678 | 0 | } |
679 | 4 | if (blocked) |
680 | 0 | continue; |
681 | 4 | CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags)); |
682 | 4 | if (CO_V(task)) |
683 | 0 | { |
684 | 0 | co_resume(CO_V(task)); |
685 | 0 | if (!co_is_done(CO_V(task))) |
686 | 0 | { |
687 | 0 | CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task); |
688 | 0 | for (j = 0; j < CO_V(schd)->stream_size; j++) |
689 | 0 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task); |
690 | 0 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task)); |
691 | 0 | } else |
692 | 0 | co_free(CO_V(task)); |
693 | 0 | } |
694 | 4 | } |
695 | 2 | int t; |
696 | 2 | CCV_SWAP(CO_V(p), CO_V(q), t); |
697 | 2 | if (CO_V(sub_task_size)) |
698 | 2 | co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[CO_V(p)], CO_V(pending_node_size)[CO_V(p)])); |
699 | 2 | } |
700 | 26.6k | } co_end() |
701 | | |
// Coroutine that runs a topsorted graph on its streams according to `schedule`.
//
// - exec_idx == -1 marks a top-level run. The task registers itself as
//   stream_context->main (first awaiting the previous main, if any, so runs on
//   the same stream context stay serial). It then makes streams[stream_0] wait
//   for the caller's stream context (or emits schedule->begin when they are the
//   same stream), and makes every stream listed in schedule->stream_1s wait on
//   that signal.
// - Any other exec_idx is a sub-graph run; stream_context must then be
//   graph->streams[0] (asserted).
// - When `exec` has CCV_NNC_GRAPH_EXEC_P_WHILE set, the nodes are run as a while
//   loop: a forward loop evaluates exec->p_while.expr after the breakpoint nodes
//   of each iteration; a backward loop replays the iteration count read from
//   tensor_tape. Otherwise the nodes are run once through
//   _ccv_nnc_graph_exec_run_loop.
// - If graph->run_state becomes CCV_NNC_GRAPH_STATE_CANCEL, the task resets
//   stream_context->main (when it owns it) and returns without waiting on any
//   stream signal.
// - On normal completion streams[stream_0] waits on the signals listed in
//   schedule->waits, and, if the caller's stream context differs from
//   streams[stream_0], the caller's context waits on schedule->end.
702 | 79.8k | co_task26.6k (_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags), private(

703 | 79.8k | ccv_nnc_graph_exec_info_t* exec_info; |
704 | 79.8k | const ccv_nnc_graph_exec_schedule_t* schd_info; |
705 | 79.8k | co_routine_t* previous_main; |
706 | 79.8k | int stream_0; |
707 | | // while loop |
708 | 79.8k | int64_t count, reverse_count; |
709 | 79.8k | int graph_breakpoint_size; |
710 | 79.8k | int i, j; |
711 | 79.8k | )) { |
712 | 79.8k | assert(CO_P(graph)->stream_size > 0); |
713 | 26.6k | int i; |
714 | | // Assign the resource container pointer. |
715 | 104k | for (i = 0; i < CO_P(graph)->stream_size; i++78.3k ) |
716 | 78.3k | CO_P(graph)->streams[i]->resource_container = CO_P(stream_context)->_inline_container; |
717 | 26.6k | CO_V(exec_info) = (ccv_nnc_graph_exec_info_t*)ccv_array_get(CO_P(graph)->exec_info, 0); |
718 | 26.6k | CO_V(schd_info) = CO_P(schedule)->exec_info; |
719 | 26.6k | CO_V(stream_0) = CO_P(schedule)->stream_0; |
// Top-level run: take over stream_context->main and set up the begin signals.
720 | 26.6k | if (CO_P(exec_idx) == -1) |
721 | 26.5k | { |
722 | 26.5k | if (CO_P(stream_context)->main) |
723 | 0 | { |
724 | 0 | CO_V(previous_main) = CO_P(stream_context)->main; |
725 | 0 | CO_P(stream_context)->main = co_self(); |
726 | | // Wait the previous task to be done. This makes sure that our graph run is serial on the same stream. |
727 | 0 | assert(!co_is_done(CO_V(previous_main))); |
728 | 0 | co_await(CO_V(previous_main)); |
729 | 0 | } else |
730 | 26.5k | CO_P(stream_context)->main = co_self(); |
731 | 26.5k | PRINT(CCV_CLI_INFO, "Graph Stream %d Begin", CO_V0 (stream_0)); |
// stream_0_signal is only assigned in the two branches below; it is read only
// inside the stream_1s loop, i.e. when schedule->stream_1_size > 0.
732 | 26.5k | ccv_nnc_stream_signal_t* stream_0_signal; |
733 | 26.5k | if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)]) |
734 | 907 | { |
735 | | // Make sure when we start work on streams[0], the current stream context is done. |
736 | 907 | stream_0_signal = ccv_nnc_stream_context_emit_signal_new(CO_P(stream_context)); |
737 | 907 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], stream_0_signal); |
738 | 25.6k | } else if (CO_P(schedule)->stream_1_size) { |
739 | 81 | ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->begin); |
740 | 81 | stream_0_signal = CO_P(schedule)->begin; |
741 | 81 | } |
742 | 26.5k | int i, flag = 0; |
743 | 26.8k | for (i = 0; i < CO_P(schedule)->stream_1_size; i++250 ) |
744 | 250 | { |
745 | 250 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_P(schedule)->stream_1s[i]], stream_0_signal); |
746 | 250 | if (!flag) |
747 | 86 | { |
748 | 86 | PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P0 (schedule)->stream_1s[i]); |
749 | 86 | flag = 1; |
750 | 86 | } else |
751 | 164 | PRINT(CCV_CLI_INFO, ", %d", CO_P0 (schedule)->stream_1s[i]); |
752 | 250 | } |
753 | 26.5k | PRINT(CCV_CLI_INFO, "\n"); |
754 | 26.5k | } else { |
755 | 4 | assert(CO_P(stream_context) == CO_P(graph)->streams[0]); |
756 | 4 | } |
// While-loop sub-graph: only supported with the graph's default schedule.
757 | 26.6k | if (CO_P(exec) && (4 CO_P4 (exec)->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)) |
758 | 2 | { |
759 | 2 | assert(CO_P(schedule) == CO_P(graph)->default_schedule); |
760 | 2 | assert(CO_P(exec)->p_while.expr); |
761 | 2 | CO_V(count) = 0; |
762 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
763 | 2 | if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
764 | 2 | { |
765 | 2 | CO_V(graph_breakpoint_size) = CO_P(graph)->breakpoint_offset + CO_P(graph)->breakpoint_size; |
766 | 10 | for (;; ++CO_V(count)) |
767 | 12 | { |
768 | 12 | CO_P(graph)->while_count = CO_V(count); |
769 | 12 | if (CO_P(tensor_tape)) |
770 | 0 | ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){ |
771 | 0 | .d = CO_P(exec_idx), |
772 | 0 | .graph = CO_P(graph)->p, |
773 | 0 | }, CO_V(count)); |
774 | 12 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), 0); |
775 | 12 | if (CO_V(count) > 0) |
776 | 10 | _ccv_nnc_graph_transit_move_to(CO_P(graph)); |
// Run the nodes up to and including the breakpoints.
777 | 12 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_V(graph_breakpoint_size), CO_P(tensor_tape), CO_P(flags))); |
778 | 12 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
779 | 0 | break; |
780 | | // Reached breakpoints, now check the breakpoint, if not met, break out. |
781 | | // Wait until everything on the stream is executed. |
782 | 24 | for (12 CO_V12 (i) = CO_P12 (graph)->breakpoint_offset; CO_V(i) < CO_V(graph_breakpoint_size); CO_V12 (i)++12 ) |
783 | 24 | for (12 CO_V12 (j) = 0; CO_V(j) < CO_V(schd_info)[CO_V(i)].stream_size; CO_V12 (j)++12 ) |
784 | 12 | co_stream_await(CO_P(graph)->streams[SCHEDULE_STREAMS(CO_V(schd_info)[CO_V(i)])[CO_V(j)]]); |
785 | 12 | _ccv_nnc_graph_exec_unwrap_while_expr(CO_P(graph), CO_P(exec)); |
786 | 12 | if (!CO_P(exec)->p_while.expr(CO_P(exec)->p_while.inputs, CO_P(exec)->p_while.input_size, CO_P(exec)->p_while.data)) |
787 | 2 | { |
788 | 2 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
789 | | // If we break from here, it is ok because all the streams are waited. |
790 | 2 | break; |
791 | 2 | } |
// Condition held: run the remaining nodes after the breakpoints.
792 | 10 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_V(graph_breakpoint_size), CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
793 | | // If it is cancelled here, we don't need to breakout yet, we can breakout on earlier place. The most important thing is to avoid stream wait if there is a cancel. |
794 | 10 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
795 | 10 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
796 | 10 | } |
797 | 2 | } else { |
798 | | // For backward graph, no need to evaluate the while expr. |
799 | 0 | assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
800 | 0 | assert(CO_P(graph)->pair); |
801 | 0 | assert(CO_P(tensor_tape)); |
802 | 0 | CO_V(count) = 0; |
803 | 0 | CO_V(reverse_count) = CO_P(graph)->while_count = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){ |
804 | 0 | .d = CO_P(exec_idx), |
805 | 0 | .graph = CO_P(graph)->p, |
806 | 0 | }); |
807 | 0 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count)); |
// The first backward pass starts from breakpoint_offset rather than node 0.
808 | 0 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_P(graph)->breakpoint_offset, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
809 | | // If it is cancelled here, we don't need to breakout yet, we can breakout later. |
810 | 0 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
811 | 0 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
812 | 0 | for (CO_V(count) = 1; CO_V(reverse_count) > 0; ++CO_V(count)) |
813 | 0 | { |
814 | 0 | CO_P(graph)->while_count = --CO_V(reverse_count); |
815 | 0 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count)); |
816 | 0 | _ccv_nnc_graph_transit_move_to(CO_P(graph)); |
817 | 0 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
818 | 0 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
819 | 0 | break; |
820 | 0 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
821 | 0 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
822 | 0 | } |
823 | 0 | } |
824 | 2 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
825 | 0 | { |
826 | | // The most important thing is to reset main and then return, we don't need to wait for any streaming event. |
827 | 0 | if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self()) |
828 | 0 | CO_P(stream_context)->main = 0; |
829 | 0 | co_return(); |
830 | 0 | } |
831 | 2 | assert(CO_V(stream_0) == 0); |
832 | 2 | int i; |
833 | 2 | for (i = 0; i < CO_P(schedule)->wait_size; i++0 ) |
834 | 0 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[0], CO_P(graph)->signals[CO_P(schedule)->waits[i]]); |
835 | 26.5k | } else { |
// Plain (non-while) run: execute every scheduled node once, honoring psort when present.
836 | 26.5k | CO_P(graph)->while_count = 0; |
837 | 26.5k | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), CO_P(schedule)->psort, 0, CO_P(schedule)->psort ? CO_P(schedule)->psort_size : CO_P(schedule)->exec_info_size, CO_P(tensor_tape), CO_P(flags))); |
838 | 26.5k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
839 | 0 | { |
840 | | // The most important thing is to reset main and then return, we don't need to wait for any streaming event. |
841 | 0 | if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self()) |
842 | 0 | CO_P(stream_context)->main = 0; |
843 | 0 | co_return(); |
844 | 0 | } |
845 | 26.5k | PRINT(CCV_CLI_INFO, "Graph Stream %d End", CO_V0 (stream_0)); |
846 | 26.5k | int i, flag = 0; |
847 | 26.7k | for (i = 0; i < CO_P(schedule)->wait_size; i++194 ) |
848 | 194 | { |
849 | 194 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(graph)->signals[CO_P(schedule)->waits[i]]); |
850 | 194 | if (!flag) |
851 | 66 | { |
852 | 66 | PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P0 (schedule)->waits[i]); |
853 | 66 | flag = 1; |
854 | 66 | } else |
855 | 128 | PRINT(CCV_CLI_INFO, ", %d", CO_P0 (schedule)->waits[i]); |
856 | 194 | } |
857 | 26.5k | PRINT(CCV_CLI_INFO, "\n"); |
858 | 26.5k | } |
// Hand control back to the caller's stream context: it waits on schedule->end
// emitted from streams[stream_0].
859 | 26.6k | if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)]) |
860 | 907 | { |
861 | 907 | assert(CO_P(exec_idx) == -1); |
862 | 907 | ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->end); |
863 | 907 | ccv_nnc_stream_context_wait_signal(CO_P(stream_context), CO_P(schedule)->end); |
864 | 907 | } |
865 | | // Reset main to 0 if it is current me. |
866 | 26.6k | if (CO_P(exec_idx) == -1 && CO_P26.5k (stream_context)->main == 26.5k co_self26.5k ()) |
867 | 26.5k | CO_P(stream_context)->main = 0; |
868 | 26.6k | } co_end() |
869 | | |
870 | | static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context); |
871 | | |
872 | | static inline void _ccv_nnc_graph_exec_run(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags) |
873 | 61.2k | { |
874 | 61.2k | int i; |
875 | 61.2k | _ccv_nnc_graph_exec_unwrap_io(graph, node); |
876 | 61.2k | ccv_nnc_tensor_t** inputs = node->inputs; |
877 | 61.2k | ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size61.0k : 0223 ; |
878 | 61.2k | if (tensor_tape) |
879 | 78 | ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size); |
880 | | /* Broadcast the updates to all subscribed references for input / output, even though at th |
881 | | * time output is not written yet, propagate pointer change is still valid. */ |
882 | 61.2k | _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node); |
883 | 61.2k | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD61.1k ) |
884 | 67 | { |
885 | 67 | assert(!stream_context); // This doesn't work properly with stream context. |
886 | 67 | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
887 | 39 | { |
888 | 39 | int ref; |
889 | 39 | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
890 | 35 | { |
891 | 35 | ref = node->case_of.offset + node->case_of.expr(inputs, node->input_size, node->case_of.data); |
892 | 35 | if (tensor_tape) |
893 | 4 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){ |
894 | 4 | .d = idx, |
895 | 4 | .graph = graph, |
896 | 4 | }, ref); |
897 | 35 | } else { |
898 | 4 | assert(node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
899 | 4 | assert(tensor_tape); |
900 | 4 | ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){ |
901 | 4 | .d = idx, |
902 | 4 | .graph = graph, |
903 | 4 | }); |
904 | 4 | } |
905 | 39 | if (ref >= 0) |
906 | 31 | { |
907 | 31 | assert(ref < node->graph_ref_size); |
908 | 31 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[ref] - 1); |
909 | 31 | _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context); |
910 | 31 | } |
911 | 39 | _ccv_nnc_graph_exec_unwrap_phi(graph, node, ref); |
912 | 39 | } else if (28 node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE28 ) { |
913 | 28 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1); |
914 | 28 | _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context); |
915 | 28 | } |
916 | 61.1k | } else { |
917 | 61.1k | PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); |
918 | 235k | for (i = 0; i < node->input_size; i++174k ) |
919 | 174k | { |
920 | 174k | PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? 0 CCV_TENSOR_GET_DEVICE_ID0 (inputs[i]->info.type) : -1)); |
921 | 174k | if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS131k (CCV_CLI_INFO)) |
922 | 0 | ccv_nnc_print_tensor_info(inputs[i]); |
923 | 174k | PRINT(CCV_CLI_INFO, "\n"); |
924 | 174k | } |
925 | 61.1k | ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, stream_context); |
926 | 156k | for (i = 0; i < node->output_size; i++95.2k ) |
927 | 95.2k | { |
928 | 95.2k | PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? 0 CCV_TENSOR_GET_DEVICE_ID0 (outputs[i]->info.type) : -1)); |
929 | 95.2k | if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS78.9k (CCV_CLI_INFO)) |
930 | 0 | ccv_nnc_print_tensor_info(outputs[i]); |
931 | 95.2k | PRINT(CCV_CLI_INFO, "\n"); |
932 | 95.2k | } |
933 | 61.1k | } |
934 | 61.2k | } |
935 | | |
936 | | static inline void _ccv_nnc_graph_topsorted_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const int flags, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
937 | 9.93k | { |
938 | 9.93k | int i; |
939 | 9.93k | if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)51 ) |
940 | 23 | { |
941 | 23 | assert(!stream_context); // This doesn't work properly with stream context. |
942 | 23 | assert(exec->p_while.expr); |
943 | 23 | int64_t count = 0; |
944 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
945 | 23 | if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
946 | 22 | { |
947 | 22 | const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size; |
948 | 104 | for (;; ++count) |
949 | 126 | { |
950 | 126 | graph->while_count = count; |
951 | 126 | if (tensor_tape) |
952 | 5 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
953 | 5 | .d = exec_idx, |
954 | 5 | .graph = graph->p, |
955 | 5 | }, count); |
956 | 126 | _ccv_nnc_graph_unwrap(graph, count, 0); |
957 | 126 | if (count > 0) |
958 | 104 | _ccv_nnc_graph_transit_move_to(graph); |
959 | 312 | for (i = 0; i < graph_breakpoint_size; i++186 ) |
960 | 186 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
961 | 126 | _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec); |
962 | | // Reached breakpoints, now check the breakpoint, if not met, break out. |
963 | 126 | if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data)) |
964 | 22 | { |
965 | 22 | _ccv_nnc_graph_rewrap(graph); |
966 | 22 | break; |
967 | 22 | } |
968 | 210 | for (i = graph_breakpoint_size; 104 i < graph->exec_info->rnum; i++106 ) |
969 | 106 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
970 | 104 | _ccv_nnc_graph_from_move_transit(graph); |
971 | 104 | _ccv_nnc_graph_rewrap(graph); |
972 | 104 | } |
973 | 22 | } else { |
974 | | // For backward graph, no need to evaluate the while expr. |
975 | 1 | assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
976 | 1 | assert(graph->pair); |
977 | 1 | assert(tensor_tape); |
978 | 1 | count = 0; |
979 | 1 | int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
980 | 1 | .d = exec_idx, |
981 | 1 | .graph = graph->p, |
982 | 1 | }); |
983 | 1 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
984 | 5 | for (i = graph->breakpoint_offset; i < graph->exec_info->rnum; i++4 ) |
985 | 4 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
986 | 1 | _ccv_nnc_graph_from_move_transit(graph); |
987 | 1 | _ccv_nnc_graph_rewrap(graph); |
988 | 5 | for (count = 1; reverse_count > 0; ++count4 ) |
989 | 4 | { |
990 | 4 | graph->while_count = --reverse_count; |
991 | 4 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
992 | 4 | _ccv_nnc_graph_transit_move_to(graph); |
993 | 20 | for (i = 0; i < graph->exec_info->rnum; i++16 ) |
994 | 16 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
995 | 4 | _ccv_nnc_graph_from_move_transit(graph); |
996 | 4 | _ccv_nnc_graph_rewrap(graph); |
997 | 4 | } |
998 | 1 | } |
999 | 9.91k | } else { |
1000 | 9.91k | graph->while_count = 0; |
1001 | 70.6k | for (i = 0; i < graph->exec_info->rnum; i++60.6k ) |
1002 | 60.6k | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
1003 | 9.91k | } |
1004 | 9.93k | } |
1005 | | |
1006 | | static inline void _ccv_nnc_graph_run_slow_path(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1007 | 35 | { |
1008 | 35 | int i, j; |
1009 | 35 | const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources27 : (ccv_nnc_graph_exec_t*)8 ccv_array_get8 (graph->sources, 0); |
1010 | 35 | const int graph_source_size = source_size ? source_size27 : graph->sources->rnum8 ; |
1011 | 35 | const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations27 : (ccv_nnc_graph_exec_t*)8 ccv_array_get8 (graph->destinations, 0); |
1012 | 35 | const int graph_destination_size = destination_size ? destination_size27 : graph->destinations->rnum8 ; |
1013 | 35 | #define visitor(node, idx, ...) \ |
1014 | 235 | _ccv_nnc_graph_exec_run(graph, node, idx, tensor_tape, stream_context, flags) |
1015 | 35 | if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE)8 ) |
1016 | 5 | { |
1017 | 5 | assert(!stream_context); // This doesn't work properly with stream context. |
1018 | 5 | assert(exec->p_while.expr); |
1019 | 5 | int64_t count = 0; |
1020 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
1021 | 5 | if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
1022 | 4 | { |
1023 | 4 | ccv_array_t* follows = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), graph->breakpoint_size, 0); |
1024 | 8 | for (i = 0; i < graph->breakpoint_size; i++4 ) |
1025 | 4 | { |
1026 | 4 | const ccv_nnc_graph_exec_info_t* const exec_info = (const ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, graph->breakpoints->d); |
1027 | 4 | if (exec_info->outgoings) |
1028 | 6 | for (j = 0; 3 j < exec_info->outgoings->rnum; j++3 ) |
1029 | 3 | { |
1030 | 3 | const ccv_nnc_graph_exec_t exec = { |
1031 | 3 | .d = *(int*)ccv_array_get(exec_info->outgoings, j), |
1032 | 3 | .graph = graph, |
1033 | 3 | }; |
1034 | 3 | ccv_array_push(follows, &exec); |
1035 | 3 | } |
1036 | 4 | } |
1037 | 19 | for (;; ++count) |
1038 | 23 | { |
1039 | 23 | graph->while_count = count; |
1040 | 23 | if (tensor_tape) |
1041 | 5 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
1042 | 5 | .d = exec_idx, |
1043 | 5 | .graph = graph->p, |
1044 | 5 | }, count); |
1045 | 23 | _ccv_nnc_graph_unwrap(graph, count, 0); |
1046 | 23 | if (count > 0) |
1047 | 19 | _ccv_nnc_graph_transit_move_to(graph); |
1048 | 28 | CCV_NNC_GRAPH_VISIT23 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph->breakpoints, graph->breakpoint_size, 0, visitor); |
1049 | 23 | _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec); |
1050 | | // Reached breakpoints, now check the breakpoint, if not met, break out. |
1051 | 23 | if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data)) |
1052 | 4 | { |
1053 | 4 | _ccv_nnc_graph_rewrap(graph); |
1054 | 4 | break; |
1055 | 4 | } |
1056 | 19 | if (follows->rnum > 0) |
1057 | 19 | CCV_NNC_GRAPH_VISIT15 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(follows, 0), follows->rnum, graph_destinations, graph_destination_size, 0, visitor15 ); |
1058 | 19 | _ccv_nnc_graph_from_move_transit(graph); |
1059 | 19 | _ccv_nnc_graph_rewrap(graph); |
1060 | 19 | } |
1061 | 4 | ccv_array_free(follows); |
1062 | 4 | } else { |
1063 | | // For backward graph, no need to evaluate the while expr. |
1064 | 1 | assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
1065 | 1 | assert(graph->pair); |
1066 | 1 | assert(tensor_tape); |
1067 | 1 | count = 0; |
1068 | 1 | int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
1069 | 1 | .d = exec_idx, |
1070 | 1 | .graph = graph->p, |
1071 | 1 | }); |
1072 | 1 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
1073 | 2 | CCV_NNC_GRAPH_VISIT1 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, graph_destinations, graph_destination_size, 1, visitor); |
1074 | 1 | _ccv_nnc_graph_from_move_transit(graph); |
1075 | 1 | _ccv_nnc_graph_rewrap(graph); |
1076 | 5 | for (count = 1; reverse_count > 0; ++count4 ) |
1077 | 4 | { |
1078 | 4 | graph->while_count = --reverse_count; |
1079 | 4 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
1080 | 4 | _ccv_nnc_graph_transit_move_to(graph); |
1081 | 8 | CCV_NNC_GRAPH_VISIT4 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor); |
1082 | 4 | _ccv_nnc_graph_from_move_transit(graph); |
1083 | 4 | _ccv_nnc_graph_rewrap(graph); |
1084 | 4 | } |
1085 | 1 | } |
1086 | 30 | } else { |
1087 | 30 | graph->while_count = 0; |
1088 | 182 | CCV_NNC_GRAPH_VISIT30 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor); |
1089 | 30 | } |
1090 | 35 | #undef visitor |
1091 | 35 | } |
1092 | | |
1093 | | static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1094 | 9.96k | { |
1095 | 9.96k | assert((sources == 0 && source_size == 0) || (sources && source_size)); |
1096 | 9.96k | assert((destinations == 0 && destination_size == 0) || (destinations && destination_size)); |
1097 | 9.96k | const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources27 : (ccv_nnc_graph_exec_t*)9.94k ccv_array_get9.94k (graph->sources, 0); |
1098 | 9.96k | const int graph_source_size = source_size ? source_size27 : graph->sources->rnum9.94k ; |
1099 | 9.96k | const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations27 : (ccv_nnc_graph_exec_t*)9.94k ccv_array_get9.94k (graph->destinations, 0); |
1100 | 9.96k | const int graph_destination_size = destination_size ? destination_size27 : graph->destinations->rnum9.94k ; |
1101 | 9.96k | int i; |
1102 | 19.9k | for (i = 0; i < graph_source_size; i++9.97k ) |
1103 | 9.97k | if (graph_sources[i].graph != graph) |
1104 | 0 | return CCV_NNC_EXEC_INVALID; |
1105 | 19.9k | for (i = 0; 9.96k i < graph_destination_size; i++9.97k ) |
1106 | 9.97k | if (graph_destinations[i].graph != graph) |
1107 | 0 | return CCV_NNC_EXEC_INVALID; |
1108 | | // When topsorted is true, there is no memory allocation when run the graph. |
1109 | 9.96k | const int topsorted = (!sources && !destinations9.94k && graph->topsorted9.94k ); |
1110 | 9.96k | if (topsorted) |
1111 | 9.93k | _ccv_nnc_graph_topsorted_run(graph, exec_idx, exec, flags, tensor_tape, stream_context); |
1112 | 35 | else |
1113 | 35 | _ccv_nnc_graph_run_slow_path(graph, exec_idx, exec, inputs, input_size, outputs, output_size, flags, sources, source_size, destinations, destination_size, tensor_tape, stream_context); |
1114 | 9.96k | return CCV_NNC_EXEC_SUCCESS; |
1115 | 9.96k | } |
1116 | | |
1117 | | int ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1118 | 10.1k | { |
1119 | 10.1k | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE); |
1120 | 10.1k | if (stream_context && graph->topsorted215 && graph->stream_size > 0215 && graph->default_schedule215 && source_size == 0215 && destination_size == 0215 ) |
1121 | 215 | { |
1122 | 215 | co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context); |
1123 | 215 | co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, graph->default_schedule, 0, tensor_tape, stream_context, flags)); |
1124 | 215 | co_schedule(scheduler, task); |
1125 | | // I don't need to worry about freeing this task, it will free itself at the end. |
1126 | 215 | return CCV_NNC_EXEC_SUCCESS; |
1127 | 215 | } else |
1128 | 9.90k | return _ccv_nnc_graph_run(graph, -1, 0, 0, 0, 0, 0, flags, sources, source_size, destinations, destination_size, tensor_tape, 0 /* In this case, we don't support stream context yet. */); |
1129 | 10.1k | } |
1130 | | |
1131 | | int ccv_nnc_graph_run_with_schedule(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_static_schedule_t* const _schedule, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const _stream_context) |
1132 | 26.3k | { |
1133 | 26.3k | assert(graph->topsorted); |
1134 | 26.3k | if (graph->exec_info->rnum == 0) |
1135 | 0 | return CCV_NNC_EXEC_SUCCESS; |
1136 | 26.3k | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE); |
1137 | 26.3k | assert(graph->stream_size > 0); |
1138 | 26.3k | const ccv_nnc_graph_static_schedule_t* const schedule = _schedule ? _schedule15.8k : graph->default_schedule10.5k ; |
1139 | 26.3k | assert(schedule); |
1140 | 26.3k | assert(schedule->stream_0 < graph->stream_size); |
1141 | 26.3k | ccv_nnc_stream_context_t* const stream_context = _stream_context ? _stream_context699 : graph->streams[schedule->stream_0]25.6k ; |
1142 | 26.3k | co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context); |
1143 | 26.3k | co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, schedule, 0, tensor_tape, stream_context, flags)); |
1144 | 26.3k | co_schedule(scheduler, task); |
1145 | | // I don't need to worry about freeing this task, it will free itself at the end. |
1146 | 26.3k | if (!_stream_context) // If no stream context provided, this is a sync operation. |
1147 | 25.6k | ccv_nnc_stream_context_wait(stream_context); |
1148 | 26.3k | return CCV_NNC_EXEC_SUCCESS; |
1149 | 26.3k | } |
1150 | | |
1151 | | void ccv_nnc_graph_cancel(ccv_nnc_graph_t* const graph) |
1152 | 0 | { |
1153 | 0 | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_CANCEL, __ATOMIC_RELEASE); |
1154 | 0 | } |