/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_graph_run.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_nnc_graph.h" |
6 | | #include "_ccv_nnc_stream.h" |
7 | | #ifdef HAVE_CUDA |
8 | | #include "gpu/ccv_nnc_compat.h" |
9 | | #elif defined(HAVE_MPS) |
10 | | #include "mps/ccv_nnc_mps.h" |
11 | | #endif |
12 | | |
13 | | // MARK - Level-2 API |
14 | | |
15 | | static void _ccv_nnc_unwrap_tensor_wrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap) |
16 | 930 | { |
17 | 930 | ccv_nnc_tensor_t* tensor = tensor_wrap->tensors[tensor_wrap->index]; |
18 | 1.96k | while (CCV_IS_TENSOR_MULTIVIEW(tensor) && |
19 | 1.96k | (((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph ||
20 | 1.06k | ((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph->pair))
21 | 1.03k | {
22 | | // If the anchor is from the pair, we use the reverse_count instead (we are looking it up).
23 | 1.03k | const int i = (int)((((ccv_nnc_tensor_multiview_t*)tensor)->anchor == (intptr_t)graph) ? count : reverse_count);
24 | 1.03k | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
25 | 1.03k | const int off = mv->kind;
26 | 1.03k | const int mod = mv->repeat;
27 | 1.03k | tensor = CCV_NNC_MULTIVIEW_DATA(mv)[i >= off ? ((i - off) % mod) + off : i]; // Unwrap.
28 | | // If reached the root. |
29 | 1.03k | if (!CCV_IS_TENSOR_MULTIVIEW(tensor)) |
30 | 889 | tensor_wrap->update_required = 1; // Need to update tensor updates. |
31 | 1.03k | ++tensor_wrap->index; |
32 | 1.03k | tensor_wrap->tensors[tensor_wrap->index] = tensor; |
33 | 1.03k | assert(tensor_wrap->index < tensor_wrap->count); |
34 | 1.03k | } |
35 | 930 | } |
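Annotation (not part of the instrumented source): the unwrap expression above, i >= off ? ((i - off) % mod) + off : i, selects which view of a multiview tensor is used on loop iteration i. A minimal, self-contained sketch of that arithmetic, assuming off stands in for mv->kind (views pinned to the first iterations) and mod for mv->repeat (the cycling tail):

static int multiview_select(const int i, const int off, const int mod)
{
	// The first `off` iterations get their own dedicated view; later iterations
	// reuse the remaining `mod` views round-robin.
	return i >= off ? ((i - off) % mod) + off : i;
}

For example, off = 1 and mod = 2 yield the view sequence 0, 1, 2, 1, 2, ... as i grows.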
36 | | |
37 | | static void _ccv_nnc_graph_unwrap_sub_graph(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count, const ccv_nnc_graph_t* const sub_graph) |
38 | 198 | { |
39 | 198 | int i; |
40 | 198 | if (sub_graph->carry_overs) |
41 | 265 | for (i = 0; i < sub_graph->carry_overs->rnum; i++)
42 | 144 | { |
43 | 144 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i); |
44 | 144 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->from); |
45 | 144 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, carry_over->to); |
46 | 144 | } |
47 | 198 | if (sub_graph->sub_graphs) |
48 | 82 | for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
49 | 61 | _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i)); |
50 | 198 | } |
51 | | |
52 | | static void _ccv_nnc_graph_unwrap(const ccv_nnc_graph_t* const graph, const int64_t count, const int64_t reverse_count) |
53 | 171 | { |
54 | 171 | if (!graph->tensor_wraps_refs) |
55 | 34 | return; |
56 | 137 | int i, j; |
57 | 510 | for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
58 | 373 | { |
59 | 373 | const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i); |
60 | 373 | const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph; |
61 | 373 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d); |
62 | 373 | if (tensor_wrap_array) |
63 | 1.36k | for (j = 0; j < tensor_wrap_array->size; j++)
64 | 994 | { |
65 | 994 | ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j]; |
66 | 994 | if (!tensor_wrap) |
67 | 352 | continue; |
68 | 642 | _ccv_nnc_unwrap_tensor_wrap(graph, count, reverse_count, tensor_wrap); |
69 | 642 | } |
70 | 373 | } |
71 | 137 | _ccv_nnc_graph_unwrap_sub_graph(graph, count, reverse_count, graph); |
72 | 137 | } |
73 | | |
74 | | static void _ccv_nnc_graph_transit_move_to(const ccv_nnc_graph_t* const graph) |
75 | 141 | { |
76 | 141 | int i; |
77 | 141 | if (graph->carry_overs) |
78 | 255 | for (i = 0; i < graph->carry_overs->rnum; i++)
79 | 137 | { |
80 | 137 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i); |
81 | 137 | ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->to->tensors[carry_over->to->index]); |
82 | 137 | assert(!CCV_IS_TENSOR_MULTIVIEW(it)); |
83 | 137 | it->data = carry_over->transit; |
84 | 137 | } |
85 | 141 | } |
86 | | |
87 | | static void _ccv_nnc_graph_from_move_transit(const ccv_nnc_graph_t* const graph) |
88 | 143 | { |
89 | 143 | int i; |
90 | 143 | if (graph->carry_overs) |
91 | 258 | for (i = 0; i < graph->carry_overs->rnum; i++)
92 | 139 | { |
93 | 139 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i); |
94 | 139 | ccv_nnc_tensor_t* it = (ccv_nnc_tensor_t*)(carry_over->from->tensors[carry_over->from->index]); |
95 | 139 | assert(!CCV_IS_TENSOR_MULTIVIEW(it)); |
96 | 139 | carry_over->transit = it->data; |
97 | 139 | } |
98 | 143 | } |
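Annotation (not part of the instrumented source): _ccv_nnc_graph_transit_move_to and _ccv_nnc_graph_from_move_transit above hand a data pointer across while-loop iterations via carry_over->transit. A minimal sketch with toy types (the real code stores the data field taken from the wrapped tensors):

typedef struct { void* data; } toy_tensor_t;
typedef struct { toy_tensor_t* from; toy_tensor_t* to; void* transit; } toy_carry_over_t;

// End of iteration N: park the "from" tensor's data pointer (mirrors _ccv_nnc_graph_from_move_transit).
static void toy_from_move_transit(toy_carry_over_t* const c)
{
	c->transit = c->from->data;
}

// Start of iteration N + 1: the parked pointer becomes the "to" tensor's data (mirrors _ccv_nnc_graph_transit_move_to).
static void toy_transit_move_to(toy_carry_over_t* const c)
{
	c->to->data = c->transit;
}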
99 | | |
100 | | static void _ccv_nnc_rewrap_tensor_wrap(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_tensor_wrap_t* const tensor_wrap) |
101 | 930 | { |
102 | 1.96k | while (tensor_wrap->index > 0 && CCV_IS_TENSOR_MULTIVIEW(tensor_wrap->tensors[tensor_wrap->index - 1]) &&
103 | 1.96k | (((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph ||
104 | 1.18k | ((ccv_nnc_tensor_multiview_t*)tensor_wrap->tensors[tensor_wrap->index - 1])->anchor == (intptr_t)graph->pair))
105 | 1.03k | --tensor_wrap->index; |
106 | 930 | } |
107 | | |
108 | | static void _ccv_nnc_graph_rewrap_sub_graph(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_t* const sub_graph) |
109 | 198 | { |
110 | 198 | int i; |
111 | 198 | if (sub_graph->carry_overs) |
112 | 265 | for (i = 0; i < sub_graph->carry_overs->rnum; i++)
113 | 144 | { |
114 | 144 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(sub_graph->carry_overs, i); |
115 | 144 | _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->from); |
116 | 144 | _ccv_nnc_rewrap_tensor_wrap(graph, carry_over->to); |
117 | 144 | } |
118 | 198 | if (sub_graph->sub_graphs) |
119 | 82 | for (i = 0; i < sub_graph->sub_graphs->rnum; i++)
120 | 61 | _ccv_nnc_graph_rewrap_sub_graph(graph, *(ccv_nnc_graph_t**)ccv_array_get(sub_graph->sub_graphs, i)); |
121 | 198 | } |
122 | | |
123 | | static void _ccv_nnc_graph_rewrap(const ccv_nnc_graph_t* const graph) // Call this method at the end to roll the wrap_ptr back |
124 | 171 | { |
125 | 171 | if (!graph->tensor_wraps_refs) |
126 | 34 | return; |
127 | 137 | int i, j; |
128 | 510 | for (i = 0; i < graph->tensor_wraps_refs->rnum; i++)
129 | 373 | { |
130 | 373 | const ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (const ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(graph->tensor_wraps_refs, i); |
131 | 373 | const ccv_nnc_graph_t* const sub_graph = tensor_wraps_ref->graph; |
132 | 373 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(sub_graph->tensor_wraps, tensor_wraps_ref->d); |
133 | 373 | if (tensor_wrap_array) |
134 | 1.36k | for (j = 0; j < tensor_wrap_array->size; j++)
135 | 994 | { |
136 | 994 | ccv_nnc_graph_tensor_wrap_t* const tensor_wrap = tensor_wrap_array->tensor_wraps[j]; |
137 | 994 | if (!tensor_wrap) |
138 | 352 | continue; |
139 | 642 | _ccv_nnc_rewrap_tensor_wrap(graph, tensor_wrap); |
140 | 642 | } |
141 | 373 | } |
142 | 137 | _ccv_nnc_graph_rewrap_sub_graph(graph, graph); |
143 | 137 | } |
144 | | |
145 | | static void _ccv_nnc_graph_exec_unwrap_io(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
146 | 290k | { |
147 | 290k | if (!node->tensor_wraps_ref) |
148 | 290k | return; |
149 | 277 | int i; |
150 | 277 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1); |
151 | 277 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
152 | 1.04k | for (i = 0; i < tensor_wrap_array->size; i++)
153 | 767 | if (tensor_wraps[i]) |
154 | 492 | { |
155 | 492 | assert(tensor_wraps[i]->index > 0); |
156 | 492 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
157 | 492 | assert(CCV_IS_TENSOR_MULTIVIEW(mv)); |
158 | | // Only now set the mv->it, because now this node is about to get executed. |
159 | 492 | mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
160 | 492 | assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it)); |
161 | 492 | } |
162 | 699 | for (i = 0; i < node->input_size; i++)
163 | 422 | if (tensor_wraps[i]) |
164 | 191 | node->inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
165 | 277 | const int d = node->input_size; |
166 | 472 | for (i = 0; i < node->output_size; i++)
167 | 195 | if (tensor_wraps[d + i]) |
168 | 151 | node->outputs[i] = tensor_wraps[d + i]->tensors[tensor_wraps[d + i]->index]; |
169 | 277 | } |
170 | | |
171 | | static void _ccv_nnc_graph_exec_unwrap_while_expr(const ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
172 | 161 | { |
173 | 161 | assert(node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE); |
174 | 161 | if (!node->p_while.tensor_wraps_ref) |
175 | 155 | return; |
176 | 6 | int i; |
177 | 6 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->p_while.tensor_wraps_ref - 1); |
178 | 6 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
179 | 18 | for (i = 0; i < tensor_wrap_array->size; i++)
180 | 12 | if (tensor_wraps[i]) |
181 | 6 | { |
182 | 6 | assert(tensor_wraps[i]->index > 0); |
183 | 6 | ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
184 | 6 | assert(CCV_IS_TENSOR_MULTIVIEW(mv)); |
185 | | // Only now set the mv->it, because now this node is about to get executed. |
186 | 6 | mv->it = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
187 | 6 | assert(!CCV_IS_TENSOR_MULTIVIEW(mv->it)); |
188 | 6 | } |
189 | 18 | for (i = 0; i < node->p_while.input_size; i++)
190 | 12 | if (tensor_wraps[i]) |
191 | 6 | node->p_while.inputs[i] = tensor_wraps[i]->tensors[tensor_wraps[i]->index]; |
192 | 6 | } |
193 | | |
194 | | static void _ccv_nnc_graph_exec_unwrap_phi(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_info_t* const node, const int ref) |
195 | 41 | { |
196 | 41 | int i; |
197 | | // If the output tensor is a phi multi-view tensor, we update our selection to all the subscribers. |
198 | 80 | for (i = 0; i < node->output_size; i++)
199 | 39 | if (CCV_IS_TENSOR_MULTIVIEW(node->outputs[i]) && |
200 | 39 | ((ccv_nnc_tensor_multiview_t*)node->outputs[i])->anchor == CCV_NNC_MULTIVIEW_PHI)
201 | 29 | { |
202 | 29 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)node->outputs[i]; |
203 | 29 | mv->it = CCV_NNC_MULTIVIEW_DATA(mv)[ref >= 0]; |
204 | 29 | ccv_nnc_tensor_multiview_synchronize(mv); |
205 | 29 | } |
206 | 41 | } |
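Annotation (not part of the instrumented source): the phi selection above indexes CCV_NNC_MULTIVIEW_DATA(mv) with ref >= 0, i.e. slot 0 when no case branch ran and slot 1 when one did. A minimal sketch of that choice, with the tensor type abstracted away:

static void* phi_select(void* const candidates[2], const int ref)
{
	// ref < 0: no case taken, keep candidates[0]; ref >= 0: a sub-graph ran, take candidates[1].
	return candidates[ref >= 0 ? 1 : 0];
}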
207 | | |
208 | | static void _ccv_nnc_graph_exec_begin_synchronize_multiviews(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node) |
209 | 290k | { |
210 | 290k | if (!node->tensor_wraps_ref) |
211 | 290k | return; |
212 | 277 | int i; |
213 | 277 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, node->tensor_wraps_ref - 1); |
214 | 277 | ccv_nnc_graph_tensor_wrap_t** const tensor_wraps = tensor_wrap_array->tensor_wraps; |
215 | 1.04k | for (i = 0; i < tensor_wrap_array->size; i++)
216 | 767 | if (tensor_wraps[i] && tensor_wraps[i]->update_required)
217 | 492 | { |
218 | 492 | assert(tensor_wraps[i]->index > 0); |
219 | 492 | ccv_nnc_tensor_multiview_t* const mv = (ccv_nnc_tensor_multiview_t*)(tensor_wraps[i]->tensors[tensor_wraps[i]->index - 1]); |
220 | | // Now update the final pointer. |
221 | 492 | ccv_nnc_tensor_multiview_synchronize(mv); |
222 | 492 | tensor_wraps[i]->update_required = 0; // Reset, no need to update. |
223 | 492 | } |
224 | 277 | } |
225 | | |
226 | | void ccv_nnc_print_tensor_shape(const ccv_nnc_tensor_t* const tensor) |
227 | 0 | { |
228 | 0 | int i; |
229 | 0 | PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]); |
230 | 0 | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++) |
231 | 0 | PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]); |
232 | 0 | PRINT(CCV_CLI_INFO, "]"); |
233 | 0 | } |
234 | | |
235 | | void ccv_nnc_print_tensor_info(const ccv_nnc_tensor_t* const tensor) |
236 | 0 | { |
237 | 0 | int i; |
238 | 0 | PRINT(CCV_CLI_INFO, " [%d", tensor->info.dim[0]); |
239 | 0 | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++) |
240 | 0 | PRINT(CCV_CLI_INFO, "x%d", tensor->info.dim[i]); |
241 | 0 | PRINT(CCV_CLI_INFO, "]"); |
242 | 0 | if (!CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE) || tensor->info.dim[0] <= 0) |
243 | 0 | return; |
244 | 0 | const int nd = ccv_nnc_tensor_nd(tensor->info.dim); |
245 | 0 | const int len = ccv_min(tensor->info.dim[nd - 1], 3); |
246 | 0 | if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_GPU_MEMORY) |
247 | 0 | { |
248 | 0 | #ifdef HAVE_CUDA |
249 | 0 | switch (tensor->info.datatype) |
250 | 0 | { |
251 | 0 | case CCV_16F: { |
252 | 0 | uint16_t data[len]; |
253 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->info.type, len * sizeof(uint16_t)); |
254 | 0 | float fp32[len]; |
255 | 0 | ccv_half_precision_to_float(data, fp32, len); |
256 | 0 | for (i = 0; i < len; i++) |
257 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
258 | 0 | break; |
259 | 0 | } |
260 | 0 | case CCV_32F: { |
261 | 0 | float data[len]; |
262 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->info.type, len * sizeof(float)); |
263 | 0 | for (i = 0; i < len; i++) |
264 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
265 | 0 | break; |
266 | 0 | } |
267 | 0 | case CCV_64F: { |
268 | 0 | double data[len]; |
269 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->info.type, len * sizeof(double)); |
270 | 0 | for (i = 0; i < len; i++) |
271 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
272 | 0 | break; |
273 | 0 | } |
274 | 0 | case CCV_32S: { |
275 | 0 | int data[len]; |
276 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->info.type, len * sizeof(int)); |
277 | 0 | for (i = 0; i < len; i++) |
278 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", data[i]); |
279 | 0 | break; |
280 | 0 | } |
281 | 0 | case CCV_64S: { |
282 | 0 | int64_t data[len]; |
283 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->info.type, len * sizeof(int64_t)); |
284 | 0 | for (i = 0; i < len; i++) |
285 | 0 | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]); |
286 | 0 | break; |
287 | 0 | } |
288 | 0 | case CCV_8U: { |
289 | 0 | uint8_t data[len]; |
290 | 0 | cumemcpy(data, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->info.type, len * sizeof(uint8_t)); |
291 | 0 | for (i = 0; i < len; i++) |
292 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]); |
293 | 0 | break; |
294 | 0 | } |
295 | 0 | } |
296 | 0 | if (ccv_nnc_tensor_count(tensor->info) > 3) |
297 | 0 | PRINT(CCV_CLI_VERBOSE, " .."); |
298 | | #elif defined(HAVE_MPS) |
299 | | switch (tensor->info.datatype) |
300 | | { |
301 | | case CCV_16F: { |
302 | | uint16_t data[len]; |
303 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f16, tensor->dataof, tensor->info.type, len * sizeof(uint16_t)); |
304 | | float fp32[len]; |
305 | | ccv_half_precision_to_float(data, fp32, len); |
306 | | for (i = 0; i < len; i++) |
307 | | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
308 | | break; |
309 | | } |
310 | | case CCV_32F: { |
311 | | float data[len]; |
312 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f32, tensor->dataof, tensor->info.type, len * sizeof(float)); |
313 | | for (i = 0; i < len; i++) |
314 | | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
315 | | break; |
316 | | } |
317 | | case CCV_64F: { |
318 | | double data[len]; |
319 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.f64, tensor->dataof, tensor->info.type, len * sizeof(double)); |
320 | | for (i = 0; i < len; i++) |
321 | | PRINT(CCV_CLI_VERBOSE, " %f", data[i]); |
322 | | break; |
323 | | } |
324 | | case CCV_32S: { |
325 | | int data[len]; |
326 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i32, tensor->dataof, tensor->info.type, len * sizeof(int)); |
327 | | for (i = 0; i < len; i++) |
328 | | PRINT(CCV_CLI_VERBOSE, " %d", data[i]); |
329 | | break; |
330 | | } |
331 | | case CCV_64S: { |
332 | | int64_t data[len]; |
333 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.i64, tensor->dataof, tensor->info.type, len * sizeof(int64_t)); |
334 | | for (i = 0; i < len; i++) |
335 | | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)data[i]); |
336 | | break; |
337 | | } |
338 | | case CCV_8U: { |
339 | | uint8_t data[len]; |
340 | | mpmemcpy(data, 0, CCV_TENSOR_CPU_MEMORY, tensor->data.u8, tensor->dataof, tensor->info.type, len * sizeof(uint8_t)); |
341 | | for (i = 0; i < len; i++) |
342 | | PRINT(CCV_CLI_VERBOSE, " %d", (int)data[i]); |
343 | | break; |
344 | | } |
345 | | } |
346 | | if (ccv_nnc_tensor_count(tensor->info) > 3) |
347 | | PRINT(CCV_CLI_VERBOSE, " .."); |
348 | | #endif |
349 | 0 | } else if (CCV_TENSOR_GET_MEMORY(tensor->info.type) == CCV_TENSOR_CPU_MEMORY) { |
350 | 0 | switch (tensor->info.datatype) |
351 | 0 | { |
352 | 0 | case CCV_16F: { |
353 | 0 | float fp32[len]; |
354 | 0 | ccv_half_precision_to_float((uint16_t*)tensor->data.f16, fp32, len); |
355 | 0 | for (i = 0; i < len; i++) |
356 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", fp32[i]); |
357 | 0 | break; |
358 | 0 | } |
359 | 0 | case CCV_32F: |
360 | 0 | for (i = 0; i < len; i++) |
361 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f32[i]); |
362 | 0 | break; |
363 | 0 | case CCV_64F: |
364 | 0 | for (i = 0; i < len; i++) |
365 | 0 | PRINT(CCV_CLI_VERBOSE, " %f", tensor->data.f64[i]); |
366 | 0 | break; |
367 | 0 | case CCV_32S: |
368 | 0 | for (i = 0; i < len; i++) |
369 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", tensor->data.i32[i]); |
370 | 0 | break; |
371 | 0 | case CCV_64S: |
372 | 0 | for (i = 0; i < len; i++) |
373 | 0 | PRINT(CCV_CLI_VERBOSE, " %lld", (long long)tensor->data.i64[i]); |
374 | 0 | break; |
375 | 0 | case CCV_8U: |
376 | 0 | for (i = 0; i < len; i++) |
377 | 0 | PRINT(CCV_CLI_VERBOSE, " %d", (int)tensor->data.u8[i]); |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | if (ccv_nnc_tensor_count(tensor->info) > 3) |
381 | 0 | PRINT(CCV_CLI_VERBOSE, " .."); |
382 | 0 | } |
383 | 0 | } |
384 | | |
385 | | static co_decl(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags)); |
386 | | |
387 | 6 | static co_decl_task(_ccv_nnc_graph_exec_cases_of_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const ccv_nnc_graph_exec_schedule_t* const schd, ccv_nnc_tensor_t* const* const inputs, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, int flags), private(
388 | 6 | int ref; |
389 | 6 | ccv_nnc_graph_t* sub_graph; |
390 | 6 | )) { |
391 | | // Wait until this stream context is done. |
392 | 6 | co_stream_await(CO_P(stream_context));
393 | 2 | if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
394 | 2 | { |
395 | 2 | CO_V(ref) = CO_P(exec)->case_of.offset + CO_P(exec)->case_of.expr(CO_P(inputs), CO_P(exec)->input_size, CO_P(exec)->case_of.data); |
396 | 2 | if (CO_P(tensor_tape)) |
397 | 0 | ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){ |
398 | 0 | .d = CO_P(exec_idx), |
399 | 0 | .graph = CO_P(graph), |
400 | 0 | }, CO_V(ref)); |
401 | 2 | } else { |
402 | 0 | assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
403 | 0 | assert(CO_P(tensor_tape)); |
404 | 0 | CO_V(ref) = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph), (ccv_nnc_graph_exec_t){ |
405 | 0 | .d = CO_P(exec_idx), |
406 | 0 | .graph = CO_P(graph), |
407 | 0 | }); |
408 | 0 | } |
409 | 2 | if (CO_V(ref) >= 0) |
410 | 2 | { |
411 | 2 | assert(CO_V(ref) < CO_P(exec)->graph_ref_size); |
412 | 2 | CO_V(sub_graph) = *(ccv_nnc_graph_t**)ccv_array_get(CO_P(graph)->sub_graphs, CCV_NNC_GRAPH_REF(CO_P(exec))[CO_V(ref)] - 1); |
413 | 2 | assert(CO_P(schd)->stream_size == 1); |
414 | 2 | assert(CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]] == CO_V(sub_graph)->streams[0]); |
415 | 2 | co_apply(_ccv_nnc_graph_topsorted_run_coro, (CO_V(sub_graph), CO_P(exec_idx), CO_V(sub_graph)->default_schedule, CO_P(exec), CO_P(tensor_tape), CO_P(graph)->streams[SCHEDULE_STREAMS(*CO_P(schd))[0]], CO_P(flags))); |
416 | 2 | } |
417 | 2 | _ccv_nnc_graph_exec_unwrap_phi(CO_P(graph), CO_P(exec), CO_V(ref)); |
418 | 2 | } co_end() |
419 | | |
420 | | typedef struct { |
421 | | ccv_nnc_graph_t* graph; |
422 | | const ccv_nnc_graph_exec_schedule_t* node; |
423 | | ccv_nnc_stream_context_t* stream; |
424 | | } ccv_nnc_graph_neighbor_context_discovery_t; |
425 | | |
426 | | static ccv_nnc_stream_context_t* _ccv_nnc_graph_neighbor_context_discovery(const int device_id, void* const context) |
427 | 13.9k | { |
428 | 13.9k | const ccv_nnc_graph_neighbor_context_discovery_t* const discovery = (ccv_nnc_graph_neighbor_context_discovery_t*)context; |
429 | 13.9k | if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(discovery->stream)) == device_id) |
430 | 3.65k | return discovery->stream; |
431 | 10.3k | ccv_nnc_graph_t* const graph = discovery->graph; |
432 | 10.3k | const ccv_nnc_graph_exec_schedule_t* const node = discovery->node; |
433 | 10.3k | int i; |
434 | | // First try to find in other streams of the same node. |
435 | 30.9k | for (i = 0; i < node->stream_size; i++)
436 | 30.9k | { |
437 | 30.9k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*node)[i]]; |
438 | 30.9k | if (CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream)) == device_id) |
439 | 10.3k | return stream; |
440 | 30.9k | } |
441 | | // If cannot find, try to find in all the wait streams. |
442 | 7 | for (i = 0; i < node->wait_size; i++)
443 | 7 | { |
444 | 7 | ccv_nnc_stream_context_t* stream_context = ccv_nnc_stream_signal_get_emitter(graph->signals[node->waits[i]]); |
445 | 7 | if (stream_context && CCV_STREAM_GET_DEVICE_ID(ccv_nnc_stream_context_type(stream_context)) == device_id) |
446 | 4 | return stream_context; |
447 | 7 | } |
448 | 0 | return 0; |
449 | 4 | } |
450 | | |
451 | | static co_routine_t* _ccv_nnc_graph_exec_run_task(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const ccv_nnc_graph_exec_schedule_t* const schd, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags) |
452 | 229k | { |
453 | 229k | _ccv_nnc_graph_exec_unwrap_io(graph, node); |
454 | 229k | ccv_nnc_tensor_t** inputs = node->inputs; |
455 | 229k | ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size : 0;
456 | 229k | if (tensor_tape) |
457 | 0 | ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size); |
458 | | /* Broadcast the updates to all subscribed references for input / output; even though at this
459 | | * time the output is not written yet, propagating the pointer change is still valid. */
460 | 229k | _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node); |
461 | 229k | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
462 | 4 | { |
463 | 4 | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
464 | 2 | { |
465 | 2 | ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]]; |
466 | 2 | return co_new(_ccv_nnc_graph_exec_cases_of_coro, (graph, idx, node, schd, inputs, tensor_tape, node_stream, flags)); |
467 | 2 | } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) { |
468 | 2 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1); |
469 | 2 | assert(graph->streams[SCHEDULE_STREAMS(*schd)[0]] == sub_graph->streams[0]); |
470 | 2 | return co_new(_ccv_nnc_graph_topsorted_run_coro, (sub_graph, idx, sub_graph->default_schedule, node, tensor_tape, graph->streams[SCHEDULE_STREAMS(*schd)[0]], flags)); |
471 | 2 | } |
472 | 229k | } else { |
473 | 229k | PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d] (%d)\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size, SCHEDULE_STREAMS(*schd)[0]);
474 | 229k | int i, j; |
475 | 229k | int flag = 0; |
476 | 469k | for (i = 0; i < schd->stream_size; i++)
477 | 239k | { |
478 | 239k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]]; |
479 | 354k | for (j = 0; j < schd->wait_size; j++)
480 | 115k | { |
481 | 115k | ccv_nnc_stream_context_wait_signal(stream, graph->signals[schd->waits[j]]); |
482 | 115k | if (!flag) |
483 | 43.1k | { |
484 | 43.1k | PRINT(CCV_CLI_INFO, "Wait: (%d, %d)", SCHEDULE_STREAMS(*schd)[i], schd->waits[j]);
485 | 43.1k | flag = 1; |
486 | 43.1k | } else |
487 | 71.9k | PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS(*schd)[i], schd->waits[j]);
488 | 115k | } |
489 | 239k | } |
490 | 229k | if (flag) |
491 | 43.1k | PRINT(CCV_CLI_INFO, "\n"); |
492 | 878k | for (i = 0; i < node->input_size; i++)
493 | 649k | { |
494 | 649k | PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
495 | 649k | if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
496 | 0 | ccv_nnc_print_tensor_info(inputs[i]); |
497 | 649k | PRINT(CCV_CLI_INFO, "\n"); |
498 | 649k | } |
499 | 585k | for (i = 0; i < node->output_size; i++)
500 | 356k | { |
501 | 356k | PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
502 | 356k | if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
503 | 0 | ccv_nnc_print_tensor_shape(outputs[i]); |
504 | 356k | PRINT(CCV_CLI_INFO, "\n"); |
505 | 356k | } |
506 | 229k | ccv_nnc_stream_context_t* const node_stream = graph->streams[SCHEDULE_STREAMS(*schd)[0]]; |
507 | 229k | ccv_nnc_graph_neighbor_context_discovery_t discovery_context = { |
508 | 229k | .graph = graph, |
509 | 229k | .node = schd, |
510 | 229k | .stream = node_stream |
511 | 229k | }; |
512 | 229k | ccv_nnc_stream_context_set_neighbor_discovery(node_stream, _ccv_nnc_graph_neighbor_context_discovery, &discovery_context); |
513 | 229k | const int status = ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, node_stream); |
514 | 229k | if (status != 0) |
515 | 0 | PRINT(CCV_CLI_INFO, "Invalid Status: %d\n", status); |
516 | 229k | if (CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_VERBOSE)) |
517 | 0 | { |
518 | 0 | for (i = 0; i < node->output_size; i++) |
519 | 0 | { |
520 | 0 | PRINT(CCV_CLI_VERBOSE, "POST: |<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1)); |
521 | 0 | if (outputs[i]) |
522 | 0 | ccv_nnc_print_tensor_info(outputs[i]); |
523 | 0 | PRINT(CCV_CLI_VERBOSE, "\n"); |
524 | 0 | } |
525 | 0 | } |
526 | 229k | flag = 0; |
527 | 469k | for (i = 0; i < schd->stream_size; i++)
528 | 239k | if (SCHEDULE_SIGNALS(*schd)[i] >= 0) |
529 | 57.9k | { |
530 | 57.9k | ccv_nnc_stream_context_t* const stream = graph->streams[SCHEDULE_STREAMS(*schd)[i]]; |
531 | 57.9k | ccv_nnc_stream_context_emit_signal(stream, graph->signals[SCHEDULE_SIGNALS(*schd)[i]]); |
532 | 57.9k | if (!flag) |
533 | 57.9k | { |
534 | 57.9k | PRINT(CCV_CLI_INFO, "Emit: (%d, %d)", SCHEDULE_STREAMS(*schd)[i], SCHEDULE_SIGNALS(*schd)[i]);
535 | 57.9k | flag = 1; |
536 | 57.9k | } else |
537 | 9 | PRINT(CCV_CLI_INFO, ", (%d, %d)", SCHEDULE_STREAMS(*schd)[i], SCHEDULE_SIGNALS(*schd)[i]);
538 | 57.9k | } |
539 | 229k | if (flag) |
540 | 57.9k | PRINT(CCV_CLI_INFO, "\n"); |
541 | 229k | } |
542 | 229k | return 0; |
543 | 229k | } |
544 | | |
545 | | static void _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_graph_exec_info_t* const node, co_routine_t* const task) |
546 | 6 | { |
547 | 6 | int i, j; |
548 | 6 | if (node->outgoings) |
549 | 8 | for (i = 0; i < node->outgoings->rnum; i++)
550 | 4 | { |
551 | 4 | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i); |
552 | 4 | const ccv_nnc_graph_exec_schedule_t* const outgoing_schd = schd_info + outgoing_idx; |
553 | | // An outgoing stream can be blocked by multiple other tasks from other streams. But it is OK, |
554 | | // because on next round of execution, that one will be marked as blocked again. |
555 | 8 | for (j = 0; j < outgoing_schd->stream_size; j++)
556 | 4 | graph->block_stream_tasks[SCHEDULE_STREAMS(*outgoing_schd)[j]] = task; |
557 | 4 | } |
558 | 6 | } |
559 | | |
560 | 6 | static co_decl_task(_ccv_nnc_graph_wait_any_sub_tasks, (ccv_nnc_graph_t* const graph, co_routine_t* const* const sub_tasks, const int sub_task_size, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const pending_nodes, const int pending_node_size), private(
561 | 6 | )) { |
562 | 6 | assert(CO_P(sub_task_size) > 0); |
563 | 2 | co_await_any(CO_P(sub_tasks), CO_P(sub_task_size)); |
564 | | // This is not good, these local variables need to be in the private section. |
565 | | // I got away with it because there is no yield or resume or apply or any after await above. |
566 | 2 | int i, j, k; |
567 | 4 | for (i = 0; i < CO_P(sub_task_size); i++)
568 | 2 | if (co_is_done(CO_P(sub_tasks)[i])) |
569 | 2 | { |
570 | 6 | for (j = 0; j < CO_P(pending_node_size); j++)
571 | 4 | { |
572 | 4 | const ccv_nnc_graph_exec_schedule_t* const node = CO_P(schd_info) + CO_P(pending_nodes)[j]; |
573 | 8 | for (k = 0; k < node->stream_size; k++)
574 | 4 | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] == CO_P(sub_tasks)[i]) |
575 | 2 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*node)[k]] = 0; |
576 | 4 | } |
577 | 2 | co_free(CO_P(sub_tasks)[i]); |
578 | 2 | } |
579 | 2 | } co_end() |
580 | | |
581 | 53.0k | static co_decl_task(_ccv_nnc_graph_exec_run_loop, (ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int* const psort, const int start_index, const int exec_info_size, ccv_nnc_tensor_tape_t* const tensor_tape, const int flags), private(
582 | 53.0k | int i, p, q; |
583 | 53.0k | int sub_task_size; |
584 | 53.0k | co_routine_t** sub_tasks; |
585 | 53.0k | int* pending_nodes[2]; |
586 | 53.0k | int pending_node_size[2]; |
587 | 53.0k | int idx; |
588 | 53.0k | ccv_nnc_graph_exec_info_t* node; |
589 | 53.0k | const ccv_nnc_graph_exec_schedule_t* schd; |
590 | 53.0k | co_routine_t* task; |
591 | 53.0k | )) { |
592 | 53.0k | CO_V(sub_task_size) = 0;
593 | 53.0k | CO_V(sub_tasks) = (co_routine_t**)ccv_nnc_graph_buffer(CO_P(graph), sizeof(co_routine_t*) * (CO_P(graph)->sub_graphs ? CO_P(graph)->sub_graphs->rnum : 0) + sizeof(int) * CO_P(exec_info_size) * 2);
594 | 53.0k | CO_V(pending_nodes)[0] = (int*)(CO_V(sub_tasks) + (CO_P(graph)->sub_graphs ? CO_P(graph)->sub_graphs->rnum : 0));
595 | 53.0k | CO_V(pending_nodes)[1] = CO_V(pending_nodes)[0] + CO_P(exec_info_size);
596 | 53.0k | CO_V(pending_node_size)[0] = 0;
597 | 53.0k | CO_V(pending_node_size)[1] = 0;
598 | 255k | for (CO_V(i) = CO_P(start_index); CO_V(i) < CO_P(exec_info_size); CO_V(i)++)
599 | 229k | { |
600 | 229k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
601 | 0 | break; |
602 | 229k | CO_V(idx) = CO_P(psort) ? CO_P(psort)[CO_V(i)] : CO_V(i);
603 | 229k | CO_V(node) = CO_P(exec_info) + CO_V(idx); |
604 | 229k | CO_V(schd) = CO_P(schd_info) + CO_V(idx); |
605 | | // If stream is blocked by but not blocked by current executing task. |
606 | 229k | int blocked = 0, j; |
607 | 469k | for (j = 0; j < CO_V(schd)->stream_size; j++)
608 | 239k | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]) |
609 | 4 | { |
610 | 4 | CO_V(pending_nodes)[0][CO_V(pending_node_size)[0]++] = CO_V(idx); |
611 | 4 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]); |
612 | 4 | blocked = 1; |
613 | 4 | } |
614 | 229k | if (blocked) |
615 | 4 | continue; |
616 | 229k | CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags)); |
617 | 229k | if (CO_V(task)) |
618 | 4 | { |
619 | 4 | co_resume(CO_V(task)); |
620 | 4 | if (!co_is_done(CO_V(task))) |
621 | 2 | { |
622 | 2 | CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task); |
623 | 2 | int j; |
624 | 4 | for (j = 0; j < CO_V(schd)->stream_size; j++)
625 | 2 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task); |
626 | 2 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task)); |
627 | 2 | } else |
628 | 2 | co_free(CO_V(task)); |
629 | 4 | } |
630 | 229k | } |
631 | 26.5k | if (CO_V(sub_task_size)) |
632 | 26.5k | co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[0], CO_V(pending_node_size)[0])); |
633 | 26.5k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
634 | 26.5k | co_return(); |
635 | 26.5k | CO_V(p) = 0; |
636 | 26.5k | CO_V(q) = 1; |
637 | 26.5k | while (CO_V(pending_node_size)[CO_V(p)] > 0) |
638 | 2 | { |
639 | 2 | CO_V(pending_node_size)[CO_V(q)] = 0; |
640 | 2 | CO_V(sub_task_size) = 0; |
641 | 6 | for (CO_V(i) = 0; CO_V(i) < CO_V(pending_node_size)[CO_V(p)]; CO_V(i)++)
642 | 4 | { |
643 | 4 | CO_V(idx) = CO_V(pending_nodes)[CO_V(p)][CO_V(i)]; |
644 | 4 | CO_V(node) = CO_P(exec_info) + CO_V(idx); |
645 | 4 | CO_V(schd) = CO_P(schd_info) + CO_V(idx); |
646 | 4 | int blocked = 0, j; |
647 | 8 | for (j = 0; j < CO_V(schd)->stream_size; j++)
648 | 4 | if (CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]) |
649 | 0 | { |
650 | 0 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]]); |
651 | 0 | CO_V(pending_nodes)[CO_V(q)][CO_V(pending_node_size)[CO_V(q)]++] = CO_V(idx); |
652 | 0 | blocked = 1; |
653 | 0 | } |
654 | 4 | if (blocked) |
655 | 0 | continue; |
656 | 4 | CO_V(task) = _ccv_nnc_graph_exec_run_task(CO_P(graph), CO_V(node), CO_V(schd), CO_V(idx), CO_P(tensor_tape), CO_P(flags)); |
657 | 4 | if (CO_V(task)) |
658 | 0 | { |
659 | 0 | co_resume(CO_V(task)); |
660 | 0 | if (!co_is_done(CO_V(task))) |
661 | 0 | { |
662 | 0 | CO_V(sub_tasks)[CO_V(sub_task_size)++] = CO_V(task); |
663 | 0 | for (j = 0; j < CO_V(schd)->stream_size; j++) |
664 | 0 | CO_P(graph)->block_stream_tasks[SCHEDULE_STREAMS(*CO_V(schd))[j]] = CO_V(task); |
665 | 0 | _ccv_nnc_graph_mark_outgoing_streams_blocked_by_task(CO_P(graph), CO_P(schd_info), CO_V(node), CO_V(task)); |
666 | 0 | } else |
667 | 0 | co_free(CO_V(task)); |
668 | 0 | } |
669 | 4 | } |
670 | 2 | int t; |
671 | 2 | CCV_SWAP(CO_V(p), CO_V(q), t); |
672 | 2 | if (CO_V(sub_task_size)) |
673 | 2 | co_apply(_ccv_nnc_graph_wait_any_sub_tasks, (CO_P(graph), CO_V(sub_tasks), CO_V(sub_task_size), CO_P(schd_info), CO_V(pending_nodes)[CO_V(p)], CO_V(pending_node_size)[CO_V(p)])); |
674 | 2 | } |
675 | 26.5k | } co_end() |
676 | | |
677 | 79.5k | co_task(_ccv_nnc_graph_topsorted_run_coro, (ccv_nnc_graph_t* const graph, const int exec_idx, const ccv_nnc_graph_static_schedule_t* const schedule, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags), private(
678 | 79.5k | ccv_nnc_graph_exec_info_t* exec_info; |
679 | 79.5k | const ccv_nnc_graph_exec_schedule_t* schd_info; |
680 | 79.5k | co_routine_t* previous_main; |
681 | 79.5k | int stream_0; |
682 | | // while loop |
683 | 79.5k | int64_t count, reverse_count; |
684 | 79.5k | int graph_breakpoint_size; |
685 | 79.5k | int i, j; |
686 | 79.5k | )) { |
687 | 79.5k | assert(CO_P(graph)->stream_size > 0); |
688 | 26.4k | int i; |
689 | | // Assign the resource container pointer. |
690 | 104k | for (i = 0; i < CO_P(graph)->stream_size; i++)
691 | 78.1k | CO_P(graph)->streams[i]->resource_container = CO_P(stream_context)->_inline_container; |
692 | 26.4k | CO_V(exec_info) = (ccv_nnc_graph_exec_info_t*)ccv_array_get(CO_P(graph)->exec_info, 0); |
693 | 26.4k | CO_V(schd_info) = CO_P(schedule)->exec_info; |
694 | 26.4k | CO_V(stream_0) = CO_P(schedule)->stream_0; |
695 | 26.4k | if (CO_P(exec_idx) == -1) |
696 | 26.4k | { |
697 | 26.4k | if (CO_P(stream_context)->main) |
698 | 0 | { |
699 | 0 | CO_V(previous_main) = CO_P(stream_context)->main; |
700 | 0 | CO_P(stream_context)->main = co_self(); |
701 | | // Wait the previous task to be done. This makes sure that our graph run is serial on the same stream. |
702 | 0 | assert(!co_is_done(CO_V(previous_main))); |
703 | 0 | co_await(CO_V(previous_main)); |
704 | 0 | } else |
705 | 26.4k | CO_P(stream_context)->main = co_self(); |
706 | 26.4k | PRINT(CCV_CLI_INFO, "Graph Stream %d Begin", CO_V(stream_0));
707 | 26.4k | ccv_nnc_stream_signal_t* stream_0_signal; |
708 | 26.4k | if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)]) |
709 | 907 | { |
710 | | // Make sure when we start work on streams[0], the current stream context is done. |
711 | 907 | stream_0_signal = ccv_nnc_stream_context_emit_signal_new(CO_P(stream_context)); |
712 | 907 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], stream_0_signal); |
713 | 25.5k | } else if (CO_P(schedule)->stream_1_size) { |
714 | 81 | ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->begin); |
715 | 81 | stream_0_signal = CO_P(schedule)->begin; |
716 | 81 | } |
717 | 26.4k | int i, flag = 0; |
718 | 26.7k | for (i = 0; i < CO_P(schedule)->stream_1_size; i++)
719 | 250 | { |
720 | 250 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_P(schedule)->stream_1s[i]], stream_0_signal); |
721 | 250 | if (!flag) |
722 | 86 | { |
723 | 86 | PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P(schedule)->stream_1s[i]);
724 | 86 | flag = 1; |
725 | 86 | } else |
726 | 164 | PRINT(CCV_CLI_INFO, ", %d", CO_P(schedule)->stream_1s[i]);
727 | 250 | } |
728 | 26.4k | PRINT(CCV_CLI_INFO, "\n"); |
729 | 26.4k | } else { |
730 | 4 | assert(CO_P(stream_context) == CO_P(graph)->streams[0]); |
731 | 4 | } |
732 | 26.4k | if (CO_P(exec) && (CO_P(exec)->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
733 | 2 | { |
734 | 2 | assert(CO_P(schedule) == CO_P(graph)->default_schedule); |
735 | 2 | assert(CO_P(exec)->p_while.expr); |
736 | 2 | CO_V(count) = 0; |
737 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
738 | 2 | if (CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
739 | 2 | { |
740 | 2 | CO_V(graph_breakpoint_size) = CO_P(graph)->breakpoint_offset + CO_P(graph)->breakpoint_size; |
741 | 10 | for (;; ++CO_V(count)) |
742 | 12 | { |
743 | 12 | CO_P(graph)->while_count = CO_V(count); |
744 | 12 | if (CO_P(tensor_tape)) |
745 | 0 | ccv_nnc_tensor_tape_set_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){ |
746 | 0 | .d = CO_P(exec_idx), |
747 | 0 | .graph = CO_P(graph)->p, |
748 | 0 | }, CO_V(count)); |
749 | 12 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), 0); |
750 | 12 | if (CO_V(count) > 0) |
751 | 10 | _ccv_nnc_graph_transit_move_to(CO_P(graph)); |
752 | 12 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_V(graph_breakpoint_size), CO_P(tensor_tape), CO_P(flags))); |
753 | 12 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
754 | 0 | break; |
755 | | // Reached breakpoints, now check the breakpoint, if not met, break out. |
756 | | // Wait until everything on the stream is executed. |
757 | 24 | for (CO_V(i) = CO_P(graph)->breakpoint_offset; CO_V(i) < CO_V(graph_breakpoint_size); CO_V(i)++)
758 | 24 | for (CO_V(j) = 0; CO_V(j) < CO_V(schd_info)[CO_V(i)].stream_size; CO_V(j)++)
759 | 12 | co_stream_await(CO_P(graph)->streams[SCHEDULE_STREAMS(CO_V(schd_info)[CO_V(i)])[CO_V(j)]]); |
760 | 12 | _ccv_nnc_graph_exec_unwrap_while_expr(CO_P(graph), CO_P(exec)); |
761 | 12 | if (!CO_P(exec)->p_while.expr(CO_P(exec)->p_while.inputs, CO_P(exec)->p_while.input_size, CO_P(exec)->p_while.data)) |
762 | 2 | { |
763 | 2 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
764 | | // If we break from here, it is ok because all the streams are waited. |
765 | 2 | break; |
766 | 2 | } |
767 | 10 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_V(graph_breakpoint_size), CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
768 | | // If it is cancelled here, we don't need to breakout yet, we can breakout on earlier place. The most important thing is to avoid stream wait if there is a cancel. |
769 | 10 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
770 | 10 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
771 | 10 | } |
772 | 2 | } else { |
773 | | // For backward graph, no need to evaluate the while expr. |
774 | 0 | assert(CO_P(exec)->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
775 | 0 | assert(CO_P(graph)->pair); |
776 | 0 | assert(CO_P(tensor_tape)); |
777 | 0 | CO_V(count) = 0; |
778 | 0 | CO_V(reverse_count) = CO_P(graph)->while_count = ccv_nnc_tensor_tape_numbering(CO_P(tensor_tape), CO_P(graph)->p, (ccv_nnc_graph_exec_t){ |
779 | 0 | .d = CO_P(exec_idx), |
780 | 0 | .graph = CO_P(graph)->p, |
781 | 0 | }); |
782 | 0 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count)); |
783 | 0 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, CO_P(graph)->breakpoint_offset, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
784 | | // If it is cancelled here, we don't need to breakout yet, we can breakout later. |
785 | 0 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
786 | 0 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
787 | 0 | for (CO_V(count) = 1; CO_V(reverse_count) > 0; ++CO_V(count)) |
788 | 0 | { |
789 | 0 | CO_P(graph)->while_count = --CO_V(reverse_count); |
790 | 0 | _ccv_nnc_graph_unwrap(CO_P(graph), CO_V(count), CO_V(reverse_count)); |
791 | 0 | _ccv_nnc_graph_transit_move_to(CO_P(graph)); |
792 | 0 | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), 0, 0, CO_P(graph)->exec_info->rnum, CO_P(tensor_tape), CO_P(flags))); |
793 | 0 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
794 | 0 | break; |
795 | 0 | _ccv_nnc_graph_from_move_transit(CO_P(graph)); |
796 | 0 | _ccv_nnc_graph_rewrap(CO_P(graph)); |
797 | 0 | } |
798 | 0 | } |
799 | 2 | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
800 | 0 | { |
801 | | // The most important thing is to reset main and then return, we don't need to wait for any streaming event. |
802 | 0 | if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self()) |
803 | 0 | CO_P(stream_context)->main = 0; |
804 | 0 | co_return(); |
805 | 0 | } |
806 | 2 | assert(CO_V(stream_0) == 0); |
807 | 2 | int i; |
808 | 2 | for (i = 0; i < CO_P(schedule)->wait_size; i++)
809 | 0 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[0], CO_P(graph)->signals[CO_P(schedule)->waits[i]]); |
810 | 26.4k | } else { |
811 | 26.4k | CO_P(graph)->while_count = 0; |
812 | 26.4k | co_apply(_ccv_nnc_graph_exec_run_loop, (CO_P(graph), CO_V(exec_info), CO_V(schd_info), CO_P(schedule)->psort, 0, CO_P(schedule)->psort ? CO_P(schedule)->psort_size : CO_P(schedule)->exec_info_size, CO_P(tensor_tape), CO_P(flags))); |
813 | 26.4k | if (__atomic_load_n(&CO_P(graph)->run_state, __ATOMIC_ACQUIRE) == CCV_NNC_GRAPH_STATE_CANCEL) |
814 | 0 | { |
815 | | // The most important thing is to reset main and then return, we don't need to wait for any streaming event. |
816 | 0 | if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self()) |
817 | 0 | CO_P(stream_context)->main = 0; |
818 | 0 | co_return(); |
819 | 0 | } |
820 | 26.4k | PRINT(CCV_CLI_INFO, "Graph Stream %d End", CO_V(stream_0));
821 | 26.4k | int i, flag = 0; |
822 | 26.6k | for (i = 0; i < CO_P(schedule)->wait_size; i++)
823 | 194 | { |
824 | 194 | ccv_nnc_stream_context_wait_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(graph)->signals[CO_P(schedule)->waits[i]]); |
825 | 194 | if (!flag) |
826 | 66 | { |
827 | 66 | PRINT(CCV_CLI_INFO, ", Wait: %d", CO_P(schedule)->waits[i]);
828 | 66 | flag = 1; |
829 | 66 | } else |
830 | 128 | PRINT(CCV_CLI_INFO, ", %d", CO_P(schedule)->waits[i]);
831 | 194 | } |
832 | 26.4k | PRINT(CCV_CLI_INFO, "\n"); |
833 | 26.4k | } |
834 | 26.4k | if (CO_P(stream_context) != CO_P(graph)->streams[CO_V(stream_0)]) |
835 | 907 | { |
836 | 907 | assert(CO_P(exec_idx) == -1); |
837 | 907 | ccv_nnc_stream_context_emit_signal(CO_P(graph)->streams[CO_V(stream_0)], CO_P(schedule)->end); |
838 | 907 | ccv_nnc_stream_context_wait_signal(CO_P(stream_context), CO_P(schedule)->end); |
839 | 907 | } |
840 | | // Reset main to 0 if it is current me. |
841 | 26.4k | if (CO_P(exec_idx) == -1 && CO_P(stream_context)->main == co_self())
842 | 26.4k | CO_P(stream_context)->main = 0; |
843 | 26.4k | } co_end() |
844 | | |
845 | | static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context); |
846 | | |
847 | | static inline void _ccv_nnc_graph_exec_run(ccv_nnc_graph_t* const graph, ccv_nnc_graph_exec_info_t* const node, const int idx, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context, const int flags) |
848 | 61.1k | { |
849 | 61.1k | int i; |
850 | 61.1k | _ccv_nnc_graph_exec_unwrap_io(graph, node); |
851 | 61.1k | ccv_nnc_tensor_t** inputs = node->inputs; |
852 | 61.1k | ccv_nnc_tensor_t** outputs = inputs ? inputs + node->input_size : 0;
853 | 61.1k | if (tensor_tape) |
854 | 78 | ccv_nnc_tensor_tape_io(tensor_tape, graph, node->input_flags, inputs, node->input_size, node->output_flags, outputs, node->output_size); |
855 | | /* Broadcast the updates to all subscribed references for input / output; even though at this
856 | | * time the output is not written yet, propagating the pointer change is still valid. */
857 | 61.1k | _ccv_nnc_graph_exec_begin_synchronize_multiviews(graph, node); |
858 | 61.1k | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD)
859 | 67 | { |
860 | 67 | assert(!stream_context); // This doesn't work properly with stream context. |
861 | 67 | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
862 | 39 | { |
863 | 39 | int ref; |
864 | 39 | if (node->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
865 | 35 | { |
866 | 35 | ref = node->case_of.offset + node->case_of.expr(inputs, node->input_size, node->case_of.data); |
867 | 35 | if (tensor_tape) |
868 | 4 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){ |
869 | 4 | .d = idx, |
870 | 4 | .graph = graph, |
871 | 4 | }, ref); |
872 | 35 | } else { |
873 | 4 | assert(node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
874 | 4 | assert(tensor_tape); |
875 | 4 | ref = ccv_nnc_tensor_tape_numbering(tensor_tape, graph, (ccv_nnc_graph_exec_t){ |
876 | 4 | .d = idx, |
877 | 4 | .graph = graph, |
878 | 4 | }); |
879 | 4 | } |
880 | 39 | if (ref >= 0) |
881 | 31 | { |
882 | 31 | assert(ref < node->graph_ref_size); |
883 | 31 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[ref] - 1); |
884 | 31 | _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context); |
885 | 31 | } |
886 | 39 | _ccv_nnc_graph_exec_unwrap_phi(graph, node, ref); |
887 | 39 | } else if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) {
888 | 28 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[0] - 1); |
889 | 28 | _ccv_nnc_graph_run(sub_graph, idx, node, inputs, node->input_size, outputs, node->output_size, flags, 0, 0, 0, 0, tensor_tape, stream_context); |
890 | 28 | } |
891 | 61.0k | } else { |
892 | 61.0k | PRINT(CCV_CLI_INFO, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); |
893 | 235k | for (i = 0; i < node->input_size; i++)
894 | 174k | { |
895 | 174k | PRINT(CCV_CLI_INFO, "|-> %d. %p (%p:%d)", i + 1, inputs[i], (inputs[i] ? inputs[i]->data.u8 : 0), (inputs[i] ? CCV_TENSOR_GET_DEVICE_ID(inputs[i]->info.type) : -1));
896 | 174k | if (inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
897 | 0 | ccv_nnc_print_tensor_info(inputs[i]); |
898 | 174k | PRINT(CCV_CLI_INFO, "\n"); |
899 | 174k | } |
900 | 61.0k | ccv_nnc_cmd_exec(node->cmd, node->hint, flags, inputs, node->input_size, outputs, node->output_size, stream_context); |
901 | 156k | for (i = 0; i < node->output_size; i++)
902 | 95.1k | { |
903 | 95.1k | PRINT(CCV_CLI_INFO, "|<- %d. %p (%p:%d)", i + 1, outputs[i], (outputs[i] ? outputs[i]->data.u8 : 0), (outputs[i] ? CCV_TENSOR_GET_DEVICE_ID(outputs[i]->info.type) : -1));
904 | 95.1k | if (outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS(CCV_CLI_INFO))
905 | 0 | ccv_nnc_print_tensor_info(outputs[i]); |
906 | 95.1k | PRINT(CCV_CLI_INFO, "\n"); |
907 | 95.1k | } |
908 | 61.0k | } |
909 | 61.1k | } |
910 | | |
911 | | static inline void _ccv_nnc_graph_topsorted_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, const int flags, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
912 | 9.90k | { |
913 | 9.90k | int i; |
914 | 9.90k | if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
915 | 23 | { |
916 | 23 | assert(!stream_context); // This doesn't work properly with stream context. |
917 | 23 | assert(exec->p_while.expr); |
918 | 23 | int64_t count = 0; |
919 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
920 | 23 | if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
921 | 22 | { |
922 | 22 | const int graph_breakpoint_size = graph->breakpoint_offset + graph->breakpoint_size; |
923 | 104 | for (;; ++count) |
924 | 126 | { |
925 | 126 | graph->while_count = count; |
926 | 126 | if (tensor_tape) |
927 | 5 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
928 | 5 | .d = exec_idx, |
929 | 5 | .graph = graph->p, |
930 | 5 | }, count); |
931 | 126 | _ccv_nnc_graph_unwrap(graph, count, 0); |
932 | 126 | if (count > 0) |
933 | 104 | _ccv_nnc_graph_transit_move_to(graph); |
934 | 312 | for (i = 0; i < graph_breakpoint_size; i++)
935 | 186 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
936 | 126 | _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec); |
937 | | // Reached breakpoints, now check the breakpoint, if not met, break out. |
938 | 126 | if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data)) |
939 | 22 | { |
940 | 22 | _ccv_nnc_graph_rewrap(graph); |
941 | 22 | break; |
942 | 22 | } |
943 | 210 | for (i = graph_breakpoint_size; i < graph->exec_info->rnum; i++)
944 | 106 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
945 | 104 | _ccv_nnc_graph_from_move_transit(graph); |
946 | 104 | _ccv_nnc_graph_rewrap(graph); |
947 | 104 | } |
948 | 22 | } else { |
949 | | // For backward graph, no need to evaluate the while expr. |
950 | 1 | assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
951 | 1 | assert(graph->pair); |
952 | 1 | assert(tensor_tape); |
953 | 1 | count = 0; |
954 | 1 | int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
955 | 1 | .d = exec_idx, |
956 | 1 | .graph = graph->p, |
957 | 1 | }); |
958 | 1 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
959 | 5 | for (i = graph->breakpoint_offset; i < graph->exec_info->rnum; i++)
960 | 4 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
961 | 1 | _ccv_nnc_graph_from_move_transit(graph); |
962 | 1 | _ccv_nnc_graph_rewrap(graph); |
963 | 5 | for (count = 1; reverse_count > 0; ++count)
964 | 4 | { |
965 | 4 | graph->while_count = --reverse_count; |
966 | 4 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
967 | 4 | _ccv_nnc_graph_transit_move_to(graph); |
968 | 20 | for (i = 0; i < graph->exec_info->rnum; i++)
969 | 16 | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
970 | 4 | _ccv_nnc_graph_from_move_transit(graph); |
971 | 4 | _ccv_nnc_graph_rewrap(graph); |
972 | 4 | } |
973 | 1 | } |
974 | 9.88k | } else { |
975 | 9.88k | graph->while_count = 0; |
976 | 70.5k | for (i = 0; i < graph->exec_info->rnum; i++)
977 | 60.6k | _ccv_nnc_graph_exec_run(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i), i, tensor_tape, stream_context, flags); |
978 | 9.88k | } |
979 | 9.90k | } |
980 | | |
981 | | static inline void _ccv_nnc_graph_run_slow_path(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
982 | 35 | { |
983 | 35 | int i, j; |
984 | 35 | const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
985 | 35 | const int graph_source_size = source_size ? source_size : graph->sources->rnum;
986 | 35 | const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
987 | 35 | const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
988 | 35 | #define visitor(node, idx, ...) \ |
989 | 235 | _ccv_nnc_graph_exec_run(graph, node, idx, tensor_tape, stream_context, flags) |
990 | 35 | if (exec && (exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE))
991 | 5 | { |
992 | 5 | assert(!stream_context); // This doesn't work properly with stream context. |
993 | 5 | assert(exec->p_while.expr); |
994 | 5 | int64_t count = 0; |
995 | | // This is a forward while loop. Backward while loop will just consult its pairing part. |
996 | 5 | if (exec->cmd.cmd == CCV_NNC_GRAPH_FORWARD) |
997 | 4 | { |
998 | 4 | ccv_array_t* follows = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), graph->breakpoint_size, 0); |
999 | 8 | for (i = 0; i < graph->breakpoint_size; i++)
1000 | 4 | { |
1001 | 4 | const ccv_nnc_graph_exec_info_t* const exec_info = (const ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, graph->breakpoints[i].d);
1002 | 4 | if (exec_info->outgoings) |
1003 | 6 | for (j = 0; j < exec_info->outgoings->rnum; j++)
1004 | 3 | { |
1005 | 3 | const ccv_nnc_graph_exec_t exec = { |
1006 | 3 | .d = *(int*)ccv_array_get(exec_info->outgoings, j), |
1007 | 3 | .graph = graph, |
1008 | 3 | }; |
1009 | 3 | ccv_array_push(follows, &exec); |
1010 | 3 | } |
1011 | 4 | } |
1012 | 19 | for (;; ++count) |
1013 | 23 | { |
1014 | 23 | graph->while_count = count; |
1015 | 23 | if (tensor_tape) |
1016 | 5 | ccv_nnc_tensor_tape_set_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
1017 | 5 | .d = exec_idx, |
1018 | 5 | .graph = graph->p, |
1019 | 5 | }, count); |
1020 | 23 | _ccv_nnc_graph_unwrap(graph, count, 0); |
1021 | 23 | if (count > 0) |
1022 | 19 | _ccv_nnc_graph_transit_move_to(graph); |
1023 | 28 | CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph->breakpoints, graph->breakpoint_size, 0, visitor);
1024 | 23 | _ccv_nnc_graph_exec_unwrap_while_expr(graph, exec); |
1025 | | // Reached the breakpoints; now evaluate the while expression, and break out if it is not met.
1026 | 23 | if (!exec->p_while.expr(exec->p_while.inputs, exec->p_while.input_size, exec->p_while.data)) |
1027 | 4 | { |
1028 | 4 | _ccv_nnc_graph_rewrap(graph); |
1029 | 4 | break; |
1030 | 4 | } |
1031 | 19 | if (follows->rnum > 0) |
1032 | 19 | CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(follows, 0), follows->rnum, graph_destinations, graph_destination_size, 0, visitor);
1033 | 19 | _ccv_nnc_graph_from_move_transit(graph); |
1034 | 19 | _ccv_nnc_graph_rewrap(graph); |
1035 | 19 | } |
1036 | 4 | ccv_array_free(follows); |
1037 | 4 | } else { |
1038 | | // For backward graph, no need to evaluate the while expr. |
1039 | 1 | assert(exec->cmd.cmd == CCV_NNC_GRAPH_BACKWARD); |
1040 | 1 | assert(graph->pair); |
1041 | 1 | assert(tensor_tape); |
1042 | 1 | count = 0; |
1043 | 1 | int64_t reverse_count = graph->while_count = ccv_nnc_tensor_tape_numbering(tensor_tape, graph->p, (ccv_nnc_graph_exec_t){ |
1044 | 1 | .d = exec_idx, |
1045 | 1 | .graph = graph->p, |
1046 | 1 | }); |
1047 | 1 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
1048 | 2 | CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, graph_destinations, graph_destination_size, 1, visitor);
1049 | 1 | _ccv_nnc_graph_from_move_transit(graph); |
1050 | 1 | _ccv_nnc_graph_rewrap(graph); |
1051 | 5 | for (count = 1; reverse_count > 0; ++count)
1052 | 4 | { |
1053 | 4 | graph->while_count = --reverse_count; |
1054 | 4 | _ccv_nnc_graph_unwrap(graph, count, reverse_count); |
1055 | 4 | _ccv_nnc_graph_transit_move_to(graph); |
1056 | 8 | CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1057 | 4 | _ccv_nnc_graph_from_move_transit(graph); |
1058 | 4 | _ccv_nnc_graph_rewrap(graph); |
1059 | 4 | } |
1060 | 1 | } |
1061 | 30 | } else { |
1062 | 30 | graph->while_count = 0; |
1063 | 182 | CCV_NNC_GRAPH_VISIT(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor);
1064 | 30 | } |
1065 | 35 | #undef visitor |
1066 | 35 | } |
1067 | | |
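The loop predicate invoked through exec->p_while.expr above receives the wrapped input tensors, their count, and an opaque data pointer, and returns nonzero to keep iterating. A minimal sketch of such a predicate, assuming purely for illustration that the caller wired an int64 counter (for example the special while-count tensor) as the first input:

#include <assert.h>
#include "ccv_nnc.h"

/* Illustrative while expression: keep looping for five iterations.
 * Assumes inputs[0] carries an int64 iteration counter. */
static int while_five(ccv_nnc_tensor_t* const* const inputs, const int input_size, const void* const data)
{
	assert(input_size >= 1);
	return inputs[0]->data.i64[0] < 5;
}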
1068 | | static int _ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int exec_idx, ccv_nnc_graph_exec_info_t* const exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1069 | 9.94k | { |
1070 | 9.94k | assert((sources == 0 && source_size == 0) || (sources && source_size)); |
1071 | 9.94k | assert((destinations == 0 && destination_size == 0) || (destinations && destination_size)); |
1072 | 9.94k | const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0);
1073 | 9.94k | const int graph_source_size = source_size ? source_size : graph->sources->rnum;
1074 | 9.94k | const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations : (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0);
1075 | 9.94k | const int graph_destination_size = destination_size ? destination_size : graph->destinations->rnum;
1076 | 9.94k | int i; |
1077 | 19.8k | for (i = 0; i < graph_source_size; i++)
1078 | 9.94k | if (graph_sources[i].graph != graph) |
1079 | 0 | return CCV_NNC_EXEC_INVALID; |
1080 | 19.8k | for (i = 0; i < graph_destination_size; i++)
1081 | 9.94k | if (graph_destinations[i].graph != graph) |
1082 | 0 | return CCV_NNC_EXEC_INVALID; |
1083 | | // When topsorted is true, there is no memory allocation when running the graph.
1084 | 9.94k | const int topsorted = (!sources && !destinations && graph->topsorted);
1085 | 9.94k | if (topsorted) |
1086 | 9.90k | _ccv_nnc_graph_topsorted_run(graph, exec_idx, exec, flags, tensor_tape, stream_context); |
1087 | 35 | else |
1088 | 35 | _ccv_nnc_graph_run_slow_path(graph, exec_idx, exec, inputs, input_size, outputs, output_size, flags, sources, source_size, destinations, destination_size, tensor_tape, stream_context); |
1089 | 9.94k | return CCV_NNC_EXEC_SUCCESS; |
1090 | 9.94k | } |
1091 | | |
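_ccv_nnc_graph_run validates that every source and destination exec actually belongs to the graph being run and returns CCV_NNC_EXEC_INVALID before executing anything otherwise; only a call with no explicit sources or destinations on a topsorted graph takes the allocation-free fast path. A hedged sketch of how a caller of the public entry point below might use that behavior (the helper name is mine):

#include "ccv_nnc.h"

/* Illustrative helper: run a sub-range of a graph. Explicit sources and
 * destinations force the slow (graph-visit) path, and execs that belong to
 * a different graph are rejected with CCV_NNC_EXEC_INVALID. */
static int run_range(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
{
	return ccv_nnc_graph_run(graph, 0, &source, 1, &destination, 1, 0, 0);
}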
1092 | | int ccv_nnc_graph_run(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1093 | 10.1k | { |
1094 | 10.1k | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE); |
1095 | 10.1k | if (stream_context && graph->topsorted && graph->stream_size > 0 && graph->default_schedule && source_size == 0 && destination_size == 0)
1096 | 215 | { |
1097 | 215 | co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context); |
1098 | 215 | co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, graph->default_schedule, 0, tensor_tape, stream_context, flags)); |
1099 | 215 | co_schedule(scheduler, task); |
1100 | | // No need to worry about freeing this task; it will free itself at the end.
1101 | 215 | return CCV_NNC_EXEC_SUCCESS; |
1102 | 215 | } else |
1103 | 9.88k | return _ccv_nnc_graph_run(graph, -1, 0, 0, 0, 0, 0, flags, sources, source_size, destinations, destination_size, tensor_tape, 0 /* In this case, we don't support stream context yet. */); |
1104 | 10.1k | } |
1105 | | |
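For reference, the simplest way to exercise ccv_nnc_graph_run is the blocking path: no stream context, so the call falls through to _ccv_nnc_graph_run above and returns only after every node has executed. The sketch below builds a one-node graph with what I understand to be the public helpers (ccv_nnc_graph_new, ccv_nnc_graph_exec_new, the TENSOR_LIST convenience macro, CMD_EWSUM_FORWARD); the shapes and command choice are illustrative, and ccv_nnc_init() is assumed to have been called once at startup.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

/* Sketch: one elementwise-sum node, run synchronously on the CPU. */
static void run_sum_once(void)
{
	ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
	ccv_nnc_tensor_t* const a = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
	ccv_nnc_tensor_t* const b = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
	ccv_nnc_tensor_t* const c = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1), 0);
	a->data.f32[0] = 1;
	b->data.f32[0] = 2;
	const ccv_nnc_graph_exec_t sum = ccv_nnc_graph_exec_new(graph, CMD_EWSUM_FORWARD(), ccv_nnc_no_hint, TENSOR_LIST(a, b), TENSOR_LIST(c));
	ccv_nnc_graph_run(graph, 0, &sum, 1, &sum, 1, 0, 0); /* blocks; c->data.f32[0] == 3 afterwards */
	ccv_nnc_tensor_free(a);
	ccv_nnc_tensor_free(b);
	ccv_nnc_tensor_free(c);
	ccv_nnc_graph_free(graph);
}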
1106 | | int ccv_nnc_graph_run_with_schedule(ccv_nnc_graph_t* const graph, const int flags, const ccv_nnc_graph_static_schedule_t* const _schedule, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const _stream_context) |
1107 | 26.2k | { |
1108 | 26.2k | assert(graph->topsorted); |
1109 | 26.2k | if (graph->exec_info->rnum == 0) |
1110 | 0 | return CCV_NNC_EXEC_SUCCESS; |
1111 | 26.2k | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_RUNNING, __ATOMIC_RELEASE); |
1112 | 26.2k | assert(graph->stream_size > 0); |
1113 | 26.2k | const ccv_nnc_graph_static_schedule_t* const schedule = _schedule ? _schedule : graph->default_schedule;
1114 | 26.2k | assert(schedule); |
1115 | 26.2k | assert(schedule->stream_0 < graph->stream_size); |
1116 | 26.2k | ccv_nnc_stream_context_t* const stream_context = _stream_context ? _stream_context : graph->streams[schedule->stream_0];
1117 | 26.2k | co_scheduler_t* const scheduler = ccv_nnc_stream_context_get_scheduler(stream_context); |
1118 | 26.2k | co_routine_t* const task = co_new(_ccv_nnc_graph_topsorted_run_coro, (graph, -1, schedule, 0, tensor_tape, stream_context, flags)); |
1119 | 26.2k | co_schedule(scheduler, task); |
1120 | | // No need to worry about freeing this task; it will free itself at the end.
1121 | 26.2k | if (!_stream_context) // If no stream context provided, this is a sync operation. |
1122 | 25.5k | ccv_nnc_stream_context_wait(stream_context); |
1123 | 26.2k | return CCV_NNC_EXEC_SUCCESS; |
1124 | 26.2k | } |
1125 | | |
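ccv_nnc_graph_run_with_schedule is synchronous only when no stream context is passed in: with a caller-owned stream context the function returns as soon as the coroutine is scheduled, and the caller synchronizes explicitly. A sketch of that asynchronous pattern, assuming the graph already carries a static schedule and that CCV_STREAM_CONTEXT_CPU is the CPU stream type:

#include "ccv_nnc.h"

/* Sketch: asynchronous run on a caller-owned stream context. */
static void run_async_then_wait(ccv_nnc_graph_t* const graph)
{
	ccv_nnc_stream_context_t* const stream = ccv_nnc_stream_context_new(CCV_STREAM_CONTEXT_CPU);
	ccv_nnc_graph_run_with_schedule(graph, 0, 0, 0, stream); /* returns once the run is scheduled */
	/* ... overlap other host-side work here ... */
	ccv_nnc_stream_context_wait(stream); /* block until the graph has finished */
	ccv_nnc_stream_context_free(stream);
}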
1126 | | void ccv_nnc_graph_cancel(ccv_nnc_graph_t* const graph) |
1127 | 0 | { |
1128 | 0 | __atomic_store_n(&graph->run_state, CCV_NNC_GRAPH_STATE_CANCEL, __ATOMIC_RELEASE); |
1129 | 0 | } |
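ccv_nnc_graph_cancel only flips the atomic run_state flag with a release store, so it is meant to be called from a thread other than the one driving the graph; the in-flight run can then observe CCV_NNC_GRAPH_STATE_CANCEL and wind down early. A sketch of a watchdog shaped as a pthread start routine (the timeout logic is illustrative):

#include <pthread.h>
#include "ccv_nnc.h"

/* Sketch: cancel a running graph from a watchdog thread, e.g. via
 * pthread_create(&thread, 0, cancel_from_watchdog, graph). */
static void* cancel_from_watchdog(void* const arg)
{
	ccv_nnc_graph_t* const graph = (ccv_nnc_graph_t*)arg;
	/* ... wait for a deadline or a user request ... */
	ccv_nnc_graph_cancel(graph);
	return 0;
}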