// lib/nnc/ccv_nnc_graph.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_graph.h"

// MARK - Level-2 API

ccv_nnc_graph_t* ccv_nnc_graph_new(void)
{
	ccv_nnc_graph_t* graph = (ccv_nnc_graph_t*)cccalloc(1, sizeof(ccv_nnc_graph_t));
	graph->exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), 5, 0);
	return graph;
}

void ccv_nnc_graph_set_sources(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const sources, const int source_size)
{
	if (!graph->sources)
		graph->sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), source_size, 0);
	else
		ccv_array_clear(graph->sources);
	int i;
	for (i = 0; i < source_size; i++)
		ccv_array_push(graph->sources, sources + i);
	graph->topsorted = 0;
}

ccv_nnc_graph_exec_t* ccv_nnc_graph_sources(const ccv_nnc_graph_t* const graph)
{
	return graph->sources ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0) : 0;
}

int ccv_nnc_graph_source_size(const ccv_nnc_graph_t* const graph)
{
	return graph->sources ? graph->sources->rnum : 0;
}

void ccv_nnc_graph_set_destinations(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t* const destinations, const int destination_size)
{
	if (!graph->destinations)
		graph->destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_t), destination_size, 0);
	else
		ccv_array_clear(graph->destinations); // Clear the destinations (not the sources) before repopulating.
	int i;
	for (i = 0; i < destination_size; i++)
		ccv_array_push(graph->destinations, destinations + i);
	graph->topsorted = 0;
}

ccv_nnc_graph_exec_t* ccv_nnc_graph_destinations(const ccv_nnc_graph_t* const graph)
{
	return graph->destinations ? (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0) : 0;
}

int ccv_nnc_graph_destination_size(const ccv_nnc_graph_t* const graph)
{
	return graph->destinations ? graph->destinations->rnum : 0;
}

void ccv_nnc_graph_exec_set(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_cmd_t cmd)
{
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	exec_info->cmd = cmd;
}

ccv_nnc_cmd_t ccv_nnc_graph_exec_cmd(const ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec)
{
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	return exec_info->cmd;
}

void ccv_nnc_graph_exec_set_hint(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_hint_t hint)
{
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	exec_info->hint = hint;
}

static int _ccv_nnc_tensor_multiview_level_count(const ccv_nnc_tensor_multiview_t* const mv)
{
	if (!CCV_IS_TENSOR_MULTIVIEW(mv))
		return 1;
	const int count = mv->kind + mv->repeat;
	int i, c = 0;
	for (i = 0; i < count; i++)
	{
		ccv_nnc_tensor_t* tv = CCV_NNC_MULTIVIEW_DATA(mv)[i];
		if (tv == CCV_NNC_TENSOR_PLACEHOLDER)
			c = ccv_max(c, 1);
		else
			c = ccv_max(c, _ccv_nnc_tensor_multiview_level_count((ccv_nnc_tensor_multiview_t*)tv));
	}
	return c + 1;
}
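
/* Worked example: for a multiview over two plain tensors (kind + repeat == 2),
 * each child contributes a depth of 1, so the level count is 2. Nesting a
 * multiview as a child adds one level per nesting; a placeholder child counts
 * as depth 1. */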

static ccv_nnc_graph_tensor_wrap_t* _ccv_nnc_graph_tensor_wrap_new(const ccv_nnc_tensor_multiview_t* const mv)
{
	const int level_count = _ccv_nnc_tensor_multiview_level_count(mv);
	ccv_nnc_graph_tensor_wrap_t* tensor_wrap = (ccv_nnc_graph_tensor_wrap_t*)ccmalloc(sizeof(ccv_nnc_graph_tensor_wrap_t) + sizeof(ccv_nnc_tensor_t*) * (level_count - 1));
	tensor_wrap->update_required = 0;
	tensor_wrap->count = level_count;
	tensor_wrap->index = 0;
	tensor_wrap->tensors[0] = (ccv_nnc_tensor_t*)mv;
	return tensor_wrap;
}
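
/* The wrap is over-allocated so its trailing `tensors` array holds one pointer
 * per unwrap level in a single allocation. A minimal sketch of the same idiom
 * with a hypothetical struct (illustrative only, not part of this file): */
#if 0
typedef struct {
	int count;
	void* slots[1]; // Declared with 1 entry; really `count` entries long.
} varlen_t;

static varlen_t* varlen_new(const int count)
{
	// One allocation covers the header plus (count - 1) extra trailing slots.
	varlen_t* const v = (varlen_t*)ccmalloc(sizeof(varlen_t) + sizeof(void*) * (count - 1));
	v->count = count;
	return v;
}
#endif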

static void _ccv_nnc_graph_exec_rewind(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
{
	if (!info->tensor_wraps_ref)
		return;
	int i;
	assert(info->tensor_wraps_ref <= graph->tensor_wraps->rnum);
	ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
	// Rewind from tensor wraps.
	for (i = 0; i < info->input_size; i++)
		if (tensor_wrap_array->tensor_wraps[i])
			info->inputs[i] = tensor_wrap_array->tensor_wraps[i]->tensors[0];
	const int d = info->input_size;
	for (i = 0; i < info->output_size; i++)
		if (tensor_wrap_array->tensor_wraps[d + i])
			info->outputs[i] = tensor_wrap_array->tensor_wraps[d + i]->tensors[0];
	const int dd = info->input_size + info->output_size;
	for (i = 0; i < info->update_size; i++)
		if (tensor_wrap_array->tensor_wraps[dd + i])
			info->updates[i] = tensor_wrap_array->tensor_wraps[dd + i]->tensors[0];
}

static void _ccv_nnc_graph_tensor_wrap_free(ccv_nnc_graph_tensor_wrap_t* const tensor_wrap)
{
	ccfree(tensor_wrap);
}

ccv_nnc_graph_tensor_wrap_array_t* ccv_nnc_get_tensor_wrap_array(ccv_nnc_graph_t* const graph, const int tensor_wrap_size, int* const tensor_wraps_ref)
{
	ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = *tensor_wraps_ref ? (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, *tensor_wraps_ref - 1) : 0;
	// If not assigned yet, find an open slot.
	if (!tensor_wrap_array_ref)
	{
		if (!graph->tensor_wraps)
			graph->tensor_wraps = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wrap_array_t*), 0, 0);
		ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = 0;
		ccv_array_push(graph->tensor_wraps, &tensor_wrap_array); // Push a null placeholder to reserve the slot.
		tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, graph->tensor_wraps->rnum - 1);
		*tensor_wraps_ref = graph->tensor_wraps->rnum;
	}
	int i;
	if (*tensor_wrap_array_ref)
	{
		if ((*tensor_wrap_array_ref)->size != tensor_wrap_size)
			*tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)ccrealloc(*tensor_wrap_array_ref, sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1));
		for (i = (*tensor_wrap_array_ref)->size; i < tensor_wrap_size; i++)
			(*tensor_wrap_array_ref)->tensor_wraps[i] = 0;
	} else
		*tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t*)cccalloc(sizeof(ccv_nnc_graph_tensor_wrap_array_t) + sizeof(ccv_nnc_graph_tensor_wrap_t*) * (tensor_wrap_size - 1), 1);
	ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
	tensor_wrap_array->size = tensor_wrap_size;
	return tensor_wrap_array;
}
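
/* `tensor_wraps_ref` is a 1-based handle into graph->tensor_wraps: 0 means
 * "no wrap array yet", and `ref - 1` is the actual array index. A minimal
 * sketch of the lookup side of this convention (hypothetical helper,
 * illustrative only): */
#if 0
static ccv_nnc_graph_tensor_wrap_array_t* wrap_array_of(const ccv_nnc_graph_t* const graph, const int tensor_wraps_ref)
{
	if (!tensor_wraps_ref) // 0 is the "absent" sentinel, so no lookup.
		return 0;
	return *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, tensor_wraps_ref - 1);
}
#endif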

void ccv_nnc_set_tensor_wraps(ccv_nnc_graph_tensor_wrap_t** const tensor_wraps, ccv_nnc_tensor_t* const* const tensors, const int tensor_size)
{
	int i;
	for (i = 0; i < tensor_size; i++)
		if (tensors[i])
		{
			if (CCV_IS_TENSOR_MULTIVIEW(tensors[i]) &&
				((ccv_nnc_tensor_multiview_t*)tensors[i])->anchor != CCV_NNC_MULTIVIEW_PHI)
			{
				if (!tensor_wraps[i] || tensors[i] != tensor_wraps[i]->tensors[0])
				{
					if (tensor_wraps[i])
						_ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
					tensor_wraps[i] = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)tensors[i]);
				}
			} else {
				if (tensor_wraps[i])
					_ccv_nnc_graph_tensor_wrap_free(tensor_wraps[i]);
				tensor_wraps[i] = 0;
			}
		}
}

void ccv_nnc_graph_register_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
{
	ccv_nnc_graph_t* p = graph;
	const ccv_nnc_graph_tensor_wraps_ref_t tensor_wraps_ref = {
		.d = tensor_wraps_ref_d,
		.graph = graph,
	};
	do {
		if (!p->tensor_wraps_refs)
		{
			p->tensor_wraps_refs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_wraps_ref_t), 0, 0);
			ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
		} else {
			int i;
			int has_tensor_wraps_ref = 0;
			for (i = 0; !has_tensor_wraps_ref && i < p->tensor_wraps_refs->rnum; i++)
			{
				ccv_nnc_graph_tensor_wraps_ref_t* tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
				has_tensor_wraps_ref = (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph);
			}
			if (!has_tensor_wraps_ref)
				ccv_array_push(p->tensor_wraps_refs, &tensor_wraps_ref);
		}
		p = p->p;
	} while (p);
}

static void _ccv_nnc_graph_redo_tensor_wraps(ccv_nnc_graph_exec_info_t* const info, ccv_nnc_graph_t* const graph)
{
	int i;
	const int has_wrap = ccv_nnc_tensors_have_wraps(info->inputs, info->input_size) ||
		ccv_nnc_tensors_have_wraps(info->outputs, info->output_size) ||
		ccv_nnc_tensors_have_wraps(info->updates, info->update_size);
	if (has_wrap)
	{
		const int tensor_wrap_size = info->input_size + info->output_size + info->update_size;
		ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = ccv_nnc_get_tensor_wrap_array(graph, tensor_wrap_size, &info->tensor_wraps_ref);
		ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps, info->inputs, info->input_size);
		const int d = info->input_size;
		ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + d, info->outputs, info->output_size);
		const int dd = info->input_size + info->output_size;
		ccv_nnc_set_tensor_wraps(tensor_wrap_array->tensor_wraps + dd, info->updates, info->update_size);
	} else if (info->tensor_wraps_ref) {
		ccv_nnc_graph_tensor_wrap_array_t** tensor_wrap_array_ref = (ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, info->tensor_wraps_ref - 1);
		ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *tensor_wrap_array_ref;
		if (tensor_wrap_array)
		{
			for (i = 0; i < tensor_wrap_array->size; i++)
				if (tensor_wrap_array->tensor_wraps[i])
					_ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[i]);
			ccfree(tensor_wrap_array);
			*tensor_wrap_array_ref = 0;
			info->tensor_wraps_ref = 0;
		}
	}
}

static void _ccv_nnc_graph_deregister_tensor_wraps(ccv_nnc_graph_t* graph, const int tensor_wraps_ref_d)
{
	ccv_nnc_graph_t* p = graph;
	do {
		int i;
		// Remove from the array.
		if (p->tensor_wraps_refs)
			for (i = 0; i < p->tensor_wraps_refs->rnum; i++)
			{
				ccv_nnc_graph_tensor_wraps_ref_t* const tensor_wraps_ref = (ccv_nnc_graph_tensor_wraps_ref_t*)ccv_array_get(p->tensor_wraps_refs, i);
				if (tensor_wraps_ref->d == tensor_wraps_ref_d && tensor_wraps_ref->graph == graph)
				{
					--p->tensor_wraps_refs->rnum;
					// Shift the tail down; memmove because source and destination overlap.
					if (i < p->tensor_wraps_refs->rnum)
						memmove(tensor_wraps_ref, tensor_wraps_ref + 1, sizeof(ccv_nnc_graph_tensor_wraps_ref_t) * (p->tensor_wraps_refs->rnum - i));
					break;
				}
			}
		p = p->p;
	} while (p);
}

void ccv_nnc_graph_exec_set_io_flags(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const int* const input_flags, const int input_flag_size, const int* const output_flags, const int output_flag_size)
{
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	assert(input_flag_size <= info->input_size);
	assert(output_flag_size <= info->output_size);
	if (info->input_size + info->output_size == 0)
		return;
	if (!info->input_flags)
	{
		info->input_flags = (int*)cccalloc(info->input_size + info->output_size, sizeof(int));
		info->output_flags = info->input_flags + info->input_size;
	}
	if (input_flag_size > 0)
		memcpy(info->input_flags, input_flags, sizeof(int) * input_flag_size);
	if (output_flag_size > 0)
		memcpy(info->output_flags, output_flags, sizeof(int) * output_flag_size);
}

void ccv_nnc_graph_exec_pair_with(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, const ccv_nnc_graph_exec_t pair_exec)
{
	assert(exec.graph == graph);
	assert(exec.d >= 0);
	assert(exec.d < graph->exec_info->rnum);
	assert(pair_exec.graph == graph || pair_exec.graph == graph->pair);
	assert(pair_exec.d >= 0);
	if (pair_exec.graph == graph)
		{ assert(pair_exec.d < graph->exec_info->rnum); }
	else
		{ assert(pair_exec.d < graph->pair->exec_info->rnum); }
	ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	exec_info->pair_ref = pair_exec.d + 1;
}

static ccv_nnc_tensor_t* _ccv_nnc_any_tensor_from_tensor_multiview(ccv_nnc_tensor_multiview_t* const mv)
{
	ccv_nnc_tensor_t* tensor = (ccv_nnc_tensor_t*)mv;
	while (CCV_IS_TENSOR_MULTIVIEW(tensor))
	{
		ccv_nnc_tensor_multiview_t* mv = (ccv_nnc_tensor_multiview_t*)tensor;
		const int count = 0;
		const int off = mv->kind;
		const int mod = mv->repeat;
		// With count fixed at 0 this always picks the first view; keep
		// unwrapping until we reach a plain (non-multiview) tensor.
		tensor = CCV_NNC_MULTIVIEW_DATA(mv)[count >= off ? ((count - off) % mod) + off : count]; // Unwrap.
	}
	return tensor;
}
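
/* The unwrap index maps a loop counter onto the views: the first `off`
 * (== kind) views are fixed, and the remaining `mod` (== repeat) views cycle.
 * A self-contained sketch of the same arithmetic (illustrative only): */
#if 0
static int unwrap_index(const int count, const int off, const int mod)
{
	return count >= off ? ((count - off) % mod) + off : count;
}
// E.g. off = 1, mod = 2: counts 0, 1, 2, 3, 4 map to views 0, 1, 2, 1, 2.
#endif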

void ccv_nnc_graph_exec_set_io(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	// De-register from the graph if it contains multiview tensors.
	if (info->tensor_wraps_ref)
		_ccv_nnc_graph_deregister_tensor_wraps(graph, info->tensor_wraps_ref - 1);
	// In case it is already executed, rewind.
	_ccv_nnc_graph_exec_rewind(info, graph);
	if (input_size == 0 && output_size == 0)
	{
		if (info->input_size > 0 || info->output_size > 0)
			ccfree(info->inputs);
		info->inputs = 0;
		info->outputs = 0;
		info->input_size = 0;
		info->output_size = 0;
		_ccv_nnc_graph_redo_tensor_wraps(info, graph);
		if (info->tensor_wraps_ref)
			ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
		return;
	}
	if (info->inputs)
		info->inputs = (ccv_nnc_tensor_t**)ccrealloc(info->inputs, sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
	else
		info->inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
	info->outputs = info->inputs + input_size;
	if (inputs)
		memcpy(info->inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
	if (outputs)
		memcpy(info->outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
	int i;
	int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
	for (i = 0; i < input_size + output_size; i++)
		if (info->inputs[i])
		{
			ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info->inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info->inputs[i]) : info->inputs[i];
			tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(tensor->info.datatype);
		}
	info->cmd.backend = ccv_nnc_cmd_find_backend(info->cmd, tensor_memory, tensor_formats, tensor_datatypes);
	info->input_size = input_size;
	info->output_size = output_size;
	_ccv_nnc_graph_redo_tensor_wraps(info, graph);
	// Register again if the tensor wraps exist.
	if (info->tensor_wraps_ref)
		ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
	// Free flags.
	if (info->input_flags)
	{
		ccfree(info->input_flags);
		info->input_flags = info->output_flags = 0;
	}
}

void ccv_nnc_graph_exec_add_as_affected(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t exec, ccv_nnc_tensor_t* const update)
{
	assert(CCV_IS_TENSOR_MULTIVIEW(update));
	assert(exec.d < graph->exec_info->rnum);
	assert(exec.graph == graph);
	ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, exec.d);
	const int register_tensor_wraps = !info->tensor_wraps_ref;
	const int update_index = info->update_size;
	++info->update_size;
	if (info->updates)
		info->updates = (ccv_nnc_tensor_t**)ccrealloc(info->updates, sizeof(ccv_nnc_tensor_t*) * info->update_size);
	else
		info->updates = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * info->update_size);
	info->updates[update_index] = update;
	_ccv_nnc_graph_redo_tensor_wraps(info, graph);
	if (register_tensor_wraps)
		ccv_nnc_graph_register_tensor_wraps(graph, info->tensor_wraps_ref - 1);
}

ccv_nnc_graph_exec_t ccv_nnc_graph_exec_new(ccv_nnc_graph_t* const graph, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
{
	int d = graph->exec_info->rnum;
	ccv_nnc_graph_exec_info_t info = {
		.cmd = cmd,
		.hint = hint,
		.input_size = input_size,
		.output_size = output_size,
	};
	assert(inputs || input_size == 0);
	assert(outputs || output_size == 0);
	if (input_size > 0 || output_size > 0)
	{
		info.inputs = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * (input_size + output_size));
		info.outputs = info.inputs + input_size;
		if (inputs)
			memcpy(info.inputs, inputs, sizeof(ccv_nnc_tensor_t*) * input_size);
		if (outputs)
			memcpy(info.outputs, outputs, sizeof(ccv_nnc_tensor_t*) * output_size);
		info.input_size = input_size;
		info.output_size = output_size;
		int i;
		int tensor_memory = 0, tensor_formats = 0, tensor_datatypes = 0;
		for (i = 0; i < input_size + output_size; i++)
			if (info.inputs[i])
			{
				ccv_nnc_tensor_t* const tensor = CCV_IS_TENSOR_MULTIVIEW(info.inputs[i]) ? _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)info.inputs[i]) : info.inputs[i];
				tensor_memory |= CCV_TENSOR_GET_MEMORY(tensor->info.type), tensor_formats |= tensor->info.format, tensor_datatypes |= CCV_GET_DATA_TYPE(tensor->info.datatype);
			}
		info.cmd.backend = ccv_nnc_cmd_find_backend(info.cmd, tensor_memory, tensor_formats, tensor_datatypes);
	}
	_ccv_nnc_graph_redo_tensor_wraps(&info, graph);
	// Add itself to the graph's wraps array; this helps the runtime do unwrapping when the graph runs.
	if (info.tensor_wraps_ref)
		ccv_nnc_graph_register_tensor_wraps(graph, info.tensor_wraps_ref - 1);
	ccv_array_push(graph->exec_info, &info);
	return (ccv_nnc_graph_exec_t){
		.d = d,
		.graph = graph,
	};
}

void ccv_nnc_graph_add_carry_over(ccv_nnc_graph_t* const graph, const ccv_nnc_tensor_t* const from, const ccv_nnc_tensor_t* const to)
{
	ccv_nnc_graph_tensor_carry_over_t carry_over = {
		.from = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)from),
		.to = _ccv_nnc_graph_tensor_wrap_new((ccv_nnc_tensor_multiview_t*)to)
	};
	if (!graph->carry_overs)
		graph->carry_overs = ccv_array_new(sizeof(ccv_nnc_graph_tensor_carry_over_t), 0, 0);
	ccv_array_push(graph->carry_overs, &carry_over);
}

int ccv_nnc_graph_exec_concat(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
{
	assert(graph == source.graph);
	assert(graph == destination.graph);
	assert(source.d < graph->exec_info->rnum);
	assert(destination.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
	if (src_info->outgoings == 0)
		src_info->outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
	else {
		int i;
		// Check if this is already connected, if so, skip.
		for (i = 0; i < src_info->outgoings->rnum; i++)
			if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
				return -1;
	}
	ccv_array_push(src_info->outgoings, &destination.d);
	graph->topsorted = 0;
	return 0;
}
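
/* A minimal end-to-end sketch of the construction API above: two no-op
 * executions wired a -> b. Illustrative only; it assumes ccv_nnc_cmd(),
 * CCV_NNC_NOOP, ccv_nnc_cmd_auto, ccv_nnc_no_hint and ccv_nnc_graph_free()
 * from the wider library behave as usual. */
#if 0
ccv_nnc_graph_t* const graph = ccv_nnc_graph_new();
const ccv_nnc_cmd_t noop = ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0);
// No tensors attached; exec_new accepts null inputs/outputs with size 0.
ccv_nnc_graph_exec_t a = ccv_nnc_graph_exec_new(graph, noop, ccv_nnc_no_hint, 0, 0, 0, 0);
ccv_nnc_graph_exec_t b = ccv_nnc_graph_exec_new(graph, noop, ccv_nnc_no_hint, 0, 0, 0, 0);
ccv_nnc_graph_exec_concat(graph, a, b); // Edge a -> b; returns -1 if it already exists.
ccv_nnc_graph_set_sources(graph, &a, 1);
ccv_nnc_graph_set_destinations(graph, &b, 1);
ccv_nnc_graph_free(graph);
#endif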

int ccv_nnc_graph_exec_disjoin(ccv_nnc_graph_t* const graph, const ccv_nnc_graph_exec_t source, const ccv_nnc_graph_exec_t destination)
{
	assert(graph == source.graph);
	assert(graph == destination.graph);
	assert(source.d < graph->exec_info->rnum);
	assert(destination.d < graph->exec_info->rnum);
	ccv_nnc_graph_exec_info_t* src_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, source.d);
	if (!src_info->outgoings)
		return -1;
	int i;
	// Find the connection; if found, remove it by swapping in the last entry.
	for (i = 0; i < src_info->outgoings->rnum; i++)
		if (*(int*)ccv_array_get(src_info->outgoings, i) == destination.d)
		{
			if (i < src_info->outgoings->rnum - 1)
				*(int*)ccv_array_get(src_info->outgoings, i) = *(int*)ccv_array_get(src_info->outgoings, src_info->outgoings->rnum - 1);
			--src_info->outgoings->rnum;
			graph->topsorted = 0;
			return 0;
		}
	return -1;
}

int ccv_nnc_graph_exec_count(const ccv_nnc_graph_t* const graph)
{
	return graph->exec_info ? graph->exec_info->rnum : 0;
}

void* ccv_nnc_graph_buffer(ccv_nnc_graph_t* const graph, int size)
{
	if (graph->buffer_size >= size)
		return graph->buffer;
	graph->buffer_size = size;
	graph->buffer = (graph->buffer) ? ccrealloc(graph->buffer, size) : ccmalloc(size);
	return graph->buffer;
}
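
/* graph->buffer is a grow-only scratch area: it only reallocates when a
 * larger size is requested, so repeated runs amortize to zero allocations.
 * Usage sketch (illustrative; `n` is a hypothetical element count): */
#if 0
const int n = 128;
int* const scratch = (int*)ccv_nnc_graph_buffer(graph, sizeof(int) * n);
// Use within this invocation only; don't free it, the graph owns the buffer.
#endif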

void ccv_nnc_graph_topsort(ccv_nnc_graph_t* const graph, int* const exec_cvt, const int exec_cvt_size)
{
	if (exec_cvt_size == 0 && graph->exec_info->rnum == 0)
	{
		graph->topsorted = 1;
		return;
	}
	assert(exec_cvt_size == graph->exec_info->rnum);
	assert(graph->sources && graph->sources->rnum);
	assert(graph->destinations && graph->destinations->rnum);
	int i, j;
	for (i = 0; i < exec_cvt_size; i++)
		exec_cvt[i] = -1;
	ccv_array_t* exec_info = ccv_array_new(sizeof(ccv_nnc_graph_exec_info_t), graph->exec_info->rnum, 0);
	// If there are breakpoints, it is more complicated: we first topsort up to the breakpoints, then continue from the breakpoints to the destinations.
	if (graph->breakpoint_size)
	{
		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, graph->breakpoints, graph->breakpoint_size, 0);
		for (i = 0; i < graph->breakpoint_size; i++)
			exec_cvt[graph->breakpoints[i].d] = -2; // Mark these as breakpoints so we skip them in the first round.
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			if (exec_cvt[idx] == -2) // Skip breakpoint.
				continue;
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
		graph->breakpoint_offset = exec_info->rnum;
		visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph->breakpoints, graph->breakpoint_size, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
		for (i = 0; i < graph->breakpoint_size; i++)
			{ assert(exec_cvt[graph->breakpoints[i].d] >= 0); } // All breakpoints should be assigned.
	} else {
		ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, 0), graph->sources->rnum, (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, 0), graph->destinations->rnum, 0);
		ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), node, idx) {
			assert(!node->pair_ref); // If node has a pair ref, we cannot fix it up.
			// Loop over node and push to the array.
			ccv_array_push(exec_info, node);
			// Go to its sub-graph to fix exec_idx
			for (i = 0; i < node->graph_ref_size; i++)
			{
				const int graph_ref = CCV_NNC_GRAPH_REF(node)[i] - 1;
				if (graph_ref >= 0)
				{
					ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, graph_ref);
					sub_graph->exec_idx = exec_info->rnum;
				}
			}
			exec_cvt[idx] = exec_info->rnum - 1;
		} ccv_nnc_graph_visit_endfor
		ccv_nnc_graph_visit_free(visit);
	}
	assert(graph->exec_info->rnum == exec_info->rnum);
	ccv_array_free(graph->exec_info);
	graph->exec_info = exec_info;
	for (i = 0; i < graph->sources->rnum; i++)
	{
		ccv_nnc_graph_exec_t* const source = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->sources, i);
		source->d = exec_cvt[source->d];
	}
	for (i = 0; i < graph->destinations->rnum; i++)
	{
		ccv_nnc_graph_exec_t* const destination = (ccv_nnc_graph_exec_t*)ccv_array_get(graph->destinations, i);
		destination->d = exec_cvt[destination->d];
	}
	// Update all outgoings to reflect the latest.
	for (i = 0; i < exec_info->rnum; i++)
	{
		ccv_nnc_graph_exec_info_t* const info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(exec_info, i);
		if (info->outgoings)
			for (j = 0; j < info->outgoings->rnum; j++)
				*(int*)ccv_array_get(info->outgoings, j) = exec_cvt[*(int*)ccv_array_get(info->outgoings, j)];
	}
	graph->topsorted = 1;
}
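
/* Topsort renumbers every execution, so any ccv_nnc_graph_exec_t handle held
 * by the caller goes stale and must be remapped through exec_cvt. A sketch,
 * continuing the hypothetical two-node example above: */
#if 0
int exec_cvt[2]; // One slot per execution: exec_cvt_size must equal graph->exec_info->rnum.
ccv_nnc_graph_topsort(graph, exec_cvt, 2);
a.d = exec_cvt[a.d]; // Remap stale handles to their post-sort indices.
b.d = exec_cvt[b.d];
#endif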

typedef struct {
	int device_id;
	int exec_idx;
	ccv_array_t* signal_set;
	ccv_array_t* command_set; // The set of commands executed on this stream. In case there is a tie (on rank), we will check this.
} ccv_nnc_stream_data_t;

static void _ccv_nnc_graph_schedule_assign_signals(ccv_array_t* const incoming, ccv_nnc_graph_exec_schedule_t* const node, ccv_array_t* const stream_data, int* const signal_size, ccv_nnc_graph_exec_schedule_t* const exec_info, const int exec_info_size)
{
	assert(incoming->rnum > 0);
	int i, j, k;
	int wait_size = 0, max_wait_size = 0;
	for (i = 0; i < incoming->rnum; i++)
	{
		const int incoming_idx = *(int*)ccv_array_get(incoming, i);
		ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
		assert(incoming_exec_info->stream_size > 0);
		max_wait_size += incoming_exec_info->stream_size;
	}
	int waits[ccv_max(1, max_wait_size)];
	assert(node->stream_size > 0);
	for (i = 0; i < incoming->rnum; i++)
	{
		const int incoming_idx = *(int*)ccv_array_get(incoming, i);
		assert(incoming_idx < exec_info_size);
		assert(incoming_idx >= 0);
		ccv_nnc_graph_exec_schedule_t* const incoming_exec_info = exec_info + incoming_idx;
		assert(incoming_exec_info->stream_size > 0);
		int stream_synced = 1;
		// If the current node's streams are a subset of the incoming node's streams,
		// there is no need to sync with a signal: we are already synced with the incoming node.
		for (j = 0; stream_synced && j < node->stream_size; j++)
		{
			const int s = SCHEDULE_STREAMS(*node)[j];
			assert(s >= 0);
			int flag = 0;
			for (k = 0; !flag && k < incoming_exec_info->stream_size; k++)
				flag = (SCHEDULE_STREAMS(*incoming_exec_info)[k] == s);
			stream_synced = flag;
		}
		if (stream_synced)
			continue;
		// Otherwise, find the streams we need to sync with, and create signals for these.
		for (j = 0; j < incoming_exec_info->stream_size; j++)
		{
			const int s = SCHEDULE_STREAMS(*incoming_exec_info)[j];
			assert(s >= 0);
			int flag = 0;
			for (k = 0; !flag && k < node->stream_size; k++)
				flag = (SCHEDULE_STREAMS(*node)[k] == s);
			if (!flag) // Need to have a signal.
			{
				if (SCHEDULE_SIGNALS(*incoming_exec_info)[j] < 0)
					SCHEDULE_SIGNALS(*incoming_exec_info)[j] = (*signal_size)++;
				else {
					int flag = 0;
					// If any stream of the current node has already seen this signal, we are good already.
					for (k = 0; !flag && k < node->stream_size; k++)
					{
						assert(SCHEDULE_STREAMS(*node)[k] >= 0);
						ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
						flag = (data->signal_set && ccv_array_find_int(data->signal_set, SCHEDULE_SIGNALS(*incoming_exec_info)[j]));
					}
					if (flag)
						continue;
				}
				// Otherwise, we need to wait on this signal. Currently, the granularity is to wait on all of the node's streams.
				waits[wait_size++] = SCHEDULE_SIGNALS(*incoming_exec_info)[j];
				// All streams on this node have now seen this signal.
				for (k = 0; k < node->stream_size; k++)
				{
					ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(*node)[k]);
					if (!data->signal_set)
						data->signal_set = ccv_array_new(sizeof(int), 0, 0);
					ccv_array_push(data->signal_set, &SCHEDULE_SIGNALS(*incoming_exec_info)[j]);
				}
			}
		}
	}
	node->wait_size = wait_size;
	if (wait_size > 0)
	{
		node->waits = node->waits ? ccrealloc(node->waits, sizeof(int) * wait_size) : ccmalloc(sizeof(int) * wait_size);
		memcpy(node->waits, waits, sizeof(int) * wait_size);
	}
}
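
/* The schedule lowers each cross-stream edge to a signal/wait pair: the
 * producer stream emits a signal once, and every consumer stream that is not
 * already ordered behind it waits on that signal. A minimal sketch of the
 * run-time pattern, assuming the public stream API from ccv_nnc.h
 * (producer_stream / consumer_stream are hypothetical): */
#if 0
ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_signal_new(CCV_STREAM_CONTEXT_GPU);
ccv_nnc_stream_context_emit_signal(producer_stream, signal); // Queued after the producer's work.
ccv_nnc_stream_context_wait_signal(consumer_stream, signal); // Consumer stalls until the signal fires.
#endif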

typedef struct {
	int rank;
	ccv_array_t* outgoings;
} ccv_nnc_incoming_t;

static int _ccv_nnc_device_ids_for_stream_data(ccv_nnc_graph_exec_info_t* const node, const int device_id, ccv_array_t* const stream_data, int* const device_ids, const int max_device_id_size)
{
	// TODO: I need to re-think whether this is GPU only or not.
	int device_id_size = ccv_nnc_device_ids_for_io(node->inputs, node->input_size, node->outputs, node->output_size, CCV_TENSOR_GPU_MEMORY, device_ids, max_device_id_size);
	if (device_id_size == 0)
	{
		// If there is a default data, use that device id. Otherwise, use the device id passed in (this will be the default data device id).
		if (stream_data->rnum > 0)
		{
			ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0);
			device_ids[0] = default_data->device_id;
		} else
			device_ids[0] = device_id >= 0 ? device_id : 0;
		device_id_size = 1;
	}
	return device_id_size;
}

void ccv_nnc_graph_static_schedule_free(ccv_nnc_graph_static_schedule_t* const schedule)
{
	int i;
	ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info;
	for (i = 0; i < schedule->exec_info_size; i++)
	{
		if (schd_info[i].stream_size > 1)
			ccfree(schd_info[i]._heap_streams);
		if (schd_info[i].waits)
			ccfree(schd_info[i].waits);
	}
	if (schedule->stream_1s)
		ccfree(schedule->stream_1s);
	if (schedule->waits)
		ccfree(schedule->waits);
	if (schedule->psort)
		ccfree(schedule->psort);
	if (schedule->begin)
		ccv_nnc_stream_signal_free(schedule->begin);
	if (schedule->end)
		ccv_nnc_stream_signal_free(schedule->end);
	ccfree(schedule);
}
739 | | |
740 | | static ccv_nnc_graph_static_schedule_t* _ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int device_id, const int max_stream_count, ccv_nnc_stream_context_t* const stream_context, const ccv_nnc_graph_exec_t* const _sources, const int _source_size, const ccv_nnc_graph_exec_t* const _destinations, const int _destination_size) |
741 | 400 | { |
742 | 400 | assert(graph->sources && graph->sources->rnum); |
743 | 400 | assert(graph->destinations && graph->destinations->rnum); |
744 | 400 | assert(graph->topsorted); // Only support this on a topsorted graph. |
745 | 400 | const int exec_info_size = graph->exec_info->rnum; |
746 | 400 | assert(exec_info_size > 0); |
747 | 400 | const ccv_nnc_graph_exec_t* const sources = _sources == 0 ? (ccv_nnc_graph_exec_t*)371 ccv_array_get371 (graph->sources, 0) : _sources29 ; |
748 | 400 | const int source_size = _sources == 0 ? graph->sources->rnum371 : _source_size29 ; |
749 | 400 | if (!_sources) |
750 | 371 | { assert(_source_size == 0); } |
751 | 400 | const ccv_nnc_graph_exec_t* const destinations = _destinations == 0 ? (ccv_nnc_graph_exec_t*)362 ccv_array_get362 (graph->destinations, 0) : _destinations38 ; |
752 | 400 | const int destination_size = _destinations == 0 ? graph->destinations->rnum362 : _destination_size38 ; |
753 | 400 | if (!_destinations) |
754 | 362 | { assert(_destination_size == 0); } |
755 | 400 | const int root_schedule = (_sources == 0 && _destinations == 0371 ); |
756 | 400 | ccv_nnc_graph_static_schedule_t* const schedule = cccalloc(1, sizeof(ccv_nnc_graph_static_schedule_t) + sizeof(ccv_nnc_graph_exec_schedule_t) * (exec_info_size - 1)); |
757 | 400 | schedule->exec_info_size = exec_info_size; |
758 | 400 | ccv_nnc_graph_exec_schedule_t* const schd_info = schedule->exec_info; |
759 | 400 | ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0); |
760 | 800 | ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new400 (graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0); |
761 | 400 | if (!root_schedule) |
762 | 63 | { |
763 | | // If this is not a root schedule, we need to do partial topsort. |
764 | 63 | int psort_size = 0; |
765 | 1.54k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
766 | 1.54k | ++psort_size; |
767 | 1.54k | } ccv_nnc_graph_visit_endfor |
768 | 63 | schedule->psort = (int*)ccmalloc(sizeof(int) * psort_size); |
769 | 63 | schedule->psort_size = psort_size; |
770 | 63 | psort_size = 0; |
771 | 1.54k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
772 | 1.54k | schedule->psort[psort_size++] = idx; |
773 | 1.54k | } ccv_nnc_graph_visit_endfor |
774 | 63 | } |
775 | 800 | int i, j, k; |
776 | | // Generate exec dependencies (or, in other words, partial ordering of executions). |
777 | 800 | ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(exec_info_size, exec_info_size, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0); |
778 | 800 | int* buf = (int*)ccmalloc400 (sizeof(int) * exec_info_size * 2); |
779 | 800 | int buf_size; |
780 | 800 | #define for_block(x, val) \ |
781 | 175k | do { \ |
782 | 175k | if (((int32_t*)val)[0] > 0) \ |
783 | 175k | { \ |
784 | 175k | buf[buf_size * 2] = x; \ |
785 | 175k | buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \ |
786 | 175k | ++buf_size; \ |
787 | 175k | } \ |
788 | 175k | } while (0) |
789 | 7.64k | for (i = 0; i < exec_info_size; i++7.24k ) |
790 | 7.24k | schd_info[i].stream_size = -1; |
791 | 5.22k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx, term) { |
792 | 5.22k | buf_size = 0; /* save all its parent deps to this buffer */ |
793 | 5.22k | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx); |
794 | 5.22k | schd_info[idx].stream_size = 0; |
795 | 5.22k | if (vector) |
796 | 175k | CCV_SPARSE_VECTOR_FOREACH4.80k (exec_dep, vector, for_block); |
797 | 5.22k | if (!node->outgoings) |
798 | 362 | continue; |
799 | 14.0k | for (i = 0; 4.85k i < node->outgoings->rnum; i++9.21k ) |
800 | 9.21k | { |
801 | 9.21k | int outgoing = *(int*)ccv_array_get(node->outgoings, i); |
802 | 9.21k | const int32_t one = 1; |
803 | 9.21k | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx); |
804 | | /* If not found, set, if the current node is the destination node, no need |
805 | | * set itself as parent of subsequent nodes because its terminal nature. */ |
806 | 9.21k | if (!term && (9.13k !cell.i329.13k || cell.i32[0] == 00 )) |
807 | 9.13k | ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one); |
808 | 309k | for (j = 0; j < buf_size; j++300k ) /* set with all idx's dependencies as well */ |
809 | 300k | { |
810 | 300k | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]); |
811 | | /* If not found, set */ |
812 | 300k | if (!cell.i32 || cell.i32[0] == 0117k ) |
813 | 183k | ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]); |
814 | 117k | else { |
815 | | /* Otherwise, set to the longest one */ |
816 | 117k | int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]); |
817 | 117k | ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep); |
818 | 117k | } |
819 | 300k | } |
820 | 9.21k | } |
821 | 4.85k | } ccv_nnc_graph_visit_endfor |
822 | 400 | #undef for_block |
823 | 400 | ccfree(buf); |
824 | | // Algorithm to allocate signals and streams for this graph. |
825 | 400 | ccv_array_t* const stream_data = ccv_array_new(sizeof(ccv_nnc_stream_data_t), 0, 0); |
826 | 400 | ccv_array_t** const outgoings = cccalloc(exec_info_size, sizeof(ccv_array_t*)); |
827 | 400 | ccv_nnc_incoming_t* const incomings = cccalloc(exec_info_size, sizeof(ccv_nnc_incoming_t)); |
828 | 400 | int max_device_id_size = 1; |
829 | | // Filter out outgoing nodes that we will be able to access it afterwards anyway. |
830 | 5.22k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
831 | 5.22k | max_device_id_size = ccv_max(node->input_size + node->output_size, max_device_id_size); |
832 | 5.22k | if (node->outgoings) |
833 | 4.85k | { |
834 | 4.85k | outgoings[idx] = ccv_array_new(sizeof(int), 0, 0); |
835 | 14.0k | for (i = 0; i < node->outgoings->rnum; i++9.21k ) |
836 | 9.21k | { |
837 | 9.21k | const int di = *(int*)ccv_array_get(node->outgoings, i); |
838 | | // Skip if we haven't accessed this exec. |
839 | 9.21k | if (schd_info[di].stream_size < 0) |
840 | 1.34k | continue; |
841 | 7.86k | int flag = 0; |
842 | 26.2k | for (j = 0; !flag && j < node->outgoings->rnum24.3k ; j++18.3k ) |
843 | 18.3k | { |
844 | 18.3k | if (j != i) |
845 | 12.2k | { |
846 | 12.2k | const int dj = *(int*)ccv_array_get(node->outgoings, j); |
847 | 12.2k | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, di, dj); |
848 | 12.2k | flag = (cell.i32 && cell.i32[0]1.88k ); |
849 | 12.2k | } |
850 | 18.3k | } |
851 | 7.86k | if (!flag) |
852 | 5.98k | { |
853 | 5.98k | ccv_array_push(outgoings[idx], &di); |
854 | 5.98k | if (!incomings[di].outgoings) |
855 | 4.80k | incomings[di].outgoings = ccv_array_new(sizeof(int), 1, 0); |
856 | 5.98k | ccv_array_push(incomings[di].outgoings, &idx); |
857 | 5.98k | } |
858 | 7.86k | } |
859 | 4.85k | } |
860 | 5.22k | } ccv_nnc_graph_visit_endfor |
861 | 400 | #define visitor(node, idx, _) \ |
862 | 5.22k | if (node->outgoings) \ |
863 | 10.7k | for (i = 0; 4.80k i < node->outgoings->rnum; i++5.98k ) \ |
864 | 5.98k | { \ |
865 | 5.98k | const int d = *(int*)ccv_array_get(node->outgoings, i); \ |
866 | 5.98k | node->rank = ccv_max(incomings[d].rank + 1, node->rank); \ |
867 | 5.98k | } |
868 | 5.22k | CCV_NNC_GRAPH_VISIT400 (graph, incomings, exec_info_size, destinations, destination_size, sources, source_size, 0, visitor); |
869 | 400 | #undef visitor |
870 | 400 | int device_ids[max_device_id_size]; |
871 | 400 | int outgoing_device_ids[max_device_id_size]; |
872 | 400 | int signal_size = 0; |
873 | 5.22k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
874 | | // Go through the incomings. |
875 | 5.22k | const int device_id_size = _ccv_nnc_device_ids_for_stream_data(node, device_id, stream_data, device_ids, max_device_id_size); |
876 | 5.22k | if (schd_info[idx].stream_size == 0) |
877 | 420 | { |
878 | 420 | schd_info[idx].stream_size = device_id_size; // At least at the same size as the device_id_size. |
879 | 420 | if (device_id_size > 1) |
880 | 6 | { |
881 | 6 | schd_info[idx]._heap_streams = (int*)ccmalloc(sizeof(int) * device_id_size * 2); |
882 | 6 | schd_info[idx]._heap_signals = (schd_info[idx]._heap_streams + device_id_size); |
883 | 6 | } |
884 | 854 | for (i = 0; i < device_id_size; i++434 ) |
885 | 434 | SCHEDULE_STREAMS(schd_info[idx])[i] = -1, SCHEDULE_SIGNALS(schd_info[idx])[i] = -1; |
886 | 420 | } |
887 | 10.8k | for (i = 0; i < device_id_size; i++5.63k ) |
888 | | // Go through until the end to assign streams. |
889 | 5.63k | if (SCHEDULE_STREAMS(schd_info[idx])[i] < 0) |
890 | 1.36k | { |
891 | 1.36k | int stream_idx = -1; |
892 | 1.36k | int stream_has_command = 0; |
893 | | // First, find a good stream in stream data (the stream is good if it can be recycled, and it has the same command). |
894 | | // Otherwise, we prefer a usable stream (it doesn't have the command, but it can be recycled). |
895 | 35.0k | for (j = 0; (stream_idx < 0 || !stream_has_command249 ) && j < stream_data->rnum35.0k ; j++33.7k ) |
896 | 33.7k | { |
897 | 33.7k | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j); |
898 | 33.7k | if (data->device_id == device_ids[i]) |
899 | 8.94k | { |
900 | 8.94k | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, idx, data->exec_idx); |
901 | | // If there is a path to conclude that exec_idx is before idx, then we can reuse |
902 | | // this stream. Otherwise the work in this "empty stream" could still be ongoing, |
903 | | // and we may delay the following work unnecessarily. |
904 | 8.94k | if (cell.i32 && cell.i32[0] > 0153 ) |
905 | 153 | { |
906 | 153 | if (ccv_array_find_uint(data->command_set, node->cmd.cmd)) |
907 | 68 | stream_idx = j, stream_has_command = 1; |
908 | 85 | else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet. |
909 | 45 | stream_idx = j; |
910 | 153 | } |
911 | 8.94k | } |
912 | 33.7k | } |
913 | 1.36k | if (stream_idx < 0) |
914 | 1.25k | { |
915 | | // Note that the max stream count is a "soft" limit. Even we have different devices, our compute allocation has to be on different streams. |
916 | 1.25k | if (stream_data->rnum >= max_stream_count && max_stream_count > 01.01k ) |
917 | 0 | { |
918 | | // If we are already at out limit, go through again to see if a stream is available, if the stream has command, and also its exec_idx is not preceding this execution. |
919 | 0 | for (j = 0; (stream_idx < 0 || !stream_has_command) && j < stream_data->rnum; j++) |
920 | 0 | { |
921 | 0 | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, j); |
922 | 0 | if (data->device_id == device_ids[i]) |
923 | 0 | { |
924 | 0 | const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, data->exec_idx, idx); |
925 | | // There must be no path from idx to exec_idx otherwise we already have stream_idx. Now we just to verify |
926 | | // there is no path from exec_idx to idx as well. |
927 | 0 | if (!cell.i32 || cell.i32[0] == 0) |
928 | 0 | { |
929 | 0 | if (ccv_array_find_uint(data->command_set, node->cmd.cmd)) |
930 | 0 | stream_idx = j, stream_has_command = 1; |
931 | 0 | else if (stream_idx < 0) // Otherwise, only assign the stream idx if it is not assigned yet. |
932 | 0 | stream_idx = j; |
933 | 0 | } |
934 | 0 | } |
935 | 0 | } |
936 | 0 | if (stream_idx >= 0) |
937 | 0 | { |
938 | | // Now need to mark exec_idx is after idx, so we can avoid A -> B -> A deadlock. |
939 | 0 | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx); |
940 | 0 | const int32_t one = 1; |
941 | 0 | ccv_set_sparse_matrix_cell(exec_dep, idx, data->exec_idx, &one); |
942 | 0 | } |
943 | 0 | } |
944 | 1.25k | if (stream_idx < 0) |
945 | 1.25k | { |
946 | 1.25k | stream_idx = stream_data->rnum; |
947 | 1.25k | const ccv_nnc_stream_data_t data = { |
948 | 1.25k | .device_id = device_ids[i], |
949 | 1.25k | }; |
950 | 1.25k | ccv_array_push(stream_data, &data); |
951 | 1.25k | } |
952 | 1.25k | } |
953 | 1.36k | assert(stream_idx >= 0); |
954 | 1.36k | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx); |
955 | 1.36k | if (!data->command_set) |
956 | 1.25k | data->command_set = ccv_array_new(sizeof(uint32_t), 1, 0); |
957 | 1.36k | SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idx; |
958 | 1.36k | ccv_array_add_unique_uint(data->command_set, node->cmd.cmd); |
959 | | // Assign all subsequent node to use this stream. |
960 | 1.36k | int outgoing_idx = idx; |
961 | | // if we want to enforce the stream count is only 1, we certainly don't want to the greedy approach. |
962 | | // With the greedy approach, the current stream will go all the way down and certainly conflict with |
963 | | // other streams. We'd prefer to interleaving the execution instead in this case. |
964 | 1.36k | if (max_stream_count != 1) |
965 | 5.63k | while (1.36k outgoings[outgoing_idx] && outgoings[outgoing_idx]->rnum5.27k ) |
966 | 5.21k | { |
967 | 5.21k | int highest_rank = -1; |
968 | 5.21k | int highest_idx = -1; |
969 | 5.21k | int stream_n = -1; |
970 | 5.21k | int stream_has_command = 0; |
971 | 12.8k | for (j = 0; j < outgoings[outgoing_idx]->rnum; j++7.62k ) |
972 | 7.62k | { |
973 | 7.62k | const int d = *(int*)ccv_array_get(outgoings[outgoing_idx], j); |
974 | | // This is not outside of our scope at this point. |
975 | 7.62k | assert(schd_info[d].stream_size >= 0); |
976 | 7.62k | ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + d; |
977 | 7.62k | const int outgoing_device_id_size = _ccv_nnc_device_ids_for_stream_data(outgoing_node, device_id, stream_data, outgoing_device_ids, max_device_id_size); |
978 | 7.62k | if (schd_info[d].stream_size == 0) |
979 | 4.80k | { |
980 | 4.80k | schd_info[d].stream_size = outgoing_device_id_size; // At least at the same size as the device_id_size. |
981 | 4.80k | if (outgoing_device_id_size > 1) |
982 | 144 | { |
983 | 144 | schd_info[d]._heap_streams = (int*)ccmalloc(sizeof(int) * outgoing_device_id_size * 2); |
984 | 144 | schd_info[d]._heap_signals = (schd_info[d]._heap_streams + outgoing_device_id_size); |
985 | 144 | } |
986 | 10.0k | for (k = 0; k < outgoing_device_id_size; k++5.20k ) |
987 | 5.20k | SCHEDULE_STREAMS(schd_info[d])[k] = -1, SCHEDULE_SIGNALS(schd_info[d])[k] = -1; |
988 | 4.80k | } |
989 | 7.62k | assert(schd_info[d].stream_size == outgoing_device_id_size); |
990 | 16.2k | for (k = 0; 7.62k k < outgoing_device_id_size; k++8.66k ) |
991 | | // If it should be on the same device and the stream is not assign, potentially. |
992 | 8.66k | if (outgoing_device_ids[k] == device_ids[i] && |
993 | 8.66k | SCHEDULE_STREAMS5.38k (schd_info[d])[k] < 05.38k && |
994 | 8.66k | (4.91k incomings[d].rank > highest_rank4.91k || |
995 | 4.91k | (647 incomings[d].rank == highest_rank647 && |
996 | 647 | !stream_has_command && ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0 ))) |
997 | 4.27k | { |
998 | 4.27k | highest_rank = incomings[d].rank; |
999 | 4.27k | highest_idx = d; |
1000 | 4.27k | stream_n = k; |
1001 | | // This is 1 if rank is the same (thus, I must break the tie already), if the rank is not the same, we need to compute this. |
1002 | 4.27k | stream_has_command = (incomings[d].rank == highest_rank || ccv_array_find_uint(data->command_set, outgoing_node->cmd.cmd)0 ); |
1003 | 4.27k | } |
1004 | 7.62k | } |
1005 | 5.21k | if (highest_idx >= 0) |
1006 | 4.27k | { |
1007 | 4.27k | outgoing_idx = highest_idx; |
1008 | 4.27k | ccv_nnc_graph_exec_info_t* const outgoing_node = exec_info + outgoing_idx; |
1009 | 4.27k | assert(stream_n >= 0); |
1010 | 4.27k | SCHEDULE_STREAMS(schd_info[outgoing_idx])[stream_n] = stream_idx; |
1011 | 4.27k | ccv_array_add_unique_uint(data->command_set, outgoing_node->cmd.cmd); |
1012 | 4.27k | } else |
1013 | 941 | break; |
1014 | 5.21k | } |
1015 | 1.36k | data->exec_idx = outgoing_idx; |
1016 | 1.36k | } |
1017 | 5.22k | } ccv_nnc_graph_visit_endfor |
1018 | | // Go through to assign signals when necessary. |
1019 | 5.22k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1020 | 5.22k | if (incomings[idx].outgoings && incomings[idx].outgoings->rnum4.80k ) |
1021 | 4.80k | _ccv_nnc_graph_schedule_assign_signals(incomings[idx].outgoings, schd_info + idx, stream_data, &signal_size, schd_info, exec_info_size); |
1022 | 5.22k | } ccv_nnc_graph_visit_endfor |
1023 | 7.64k | for (i = 0; i < exec_info_size; i++7.24k ) |
1024 | 7.24k | if (outgoings[i]) |
1025 | 4.85k | ccv_array_free(outgoings[i]); |
1026 | 400 | ccfree(outgoings); |
1027 | 400 | ccv_matrix_free(exec_dep); |
1028 | 400 | ccv_nnc_stream_data_t* const default_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0); |
1029 | 400 | if (device_id >= 0) |
1030 | 4 | { |
1031 | | // If the default stream (stream 0) is not the same as desired stream, swap with the one that is. |
1032 | 4 | if (default_data->device_id != device_id) |
1033 | 0 | { |
1034 | 0 | int exchange_stream_idx = -1; |
1035 | | // Find the stream idx to exchange. |
1036 | 0 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1037 | 0 | int flag = 0; |
1038 | 0 | for(i = 0; !flag && i < schd_info[idx].stream_size; i++) |
1039 | 0 | { |
1040 | 0 | const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[i]; |
1041 | 0 | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx); |
1042 | 0 | if (data->device_id == device_id) |
1043 | 0 | { |
1044 | 0 | exchange_stream_idx = stream_idx; |
1045 | 0 | flag = 1; |
1046 | 0 | } |
1047 | 0 | } |
1048 | 0 | if (flag) |
1049 | 0 | break; |
1050 | 0 | } ccv_nnc_graph_visit_endfor |
1051 | 0 | assert(exchange_stream_idx >= 0); |
1052 | 0 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1053 | 0 | for (i = 0; i < schd_info[idx].stream_size; i++) |
1054 | 0 | if (SCHEDULE_STREAMS(schd_info[idx])[i] == 0) |
1055 | 0 | SCHEDULE_STREAMS(schd_info[idx])[i] = -1; |
1056 | 0 | } ccv_nnc_graph_visit_endfor |
1057 | 0 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1058 | 0 | for (i = 0; i < schd_info[idx].stream_size; i++) |
1059 | 0 | if (SCHEDULE_STREAMS(schd_info[idx])[i] == exchange_stream_idx) |
1060 | 0 | SCHEDULE_STREAMS(schd_info[idx])[i] = 0; |
1061 | 0 | } ccv_nnc_graph_visit_endfor |
1062 | 0 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1063 | 0 | for (i = 0; i < schd_info[idx].stream_size; i++) |
1064 | 0 | if (SCHEDULE_STREAMS(schd_info[idx])[i] == -1) |
1065 | 0 | SCHEDULE_STREAMS(schd_info[idx])[i] = exchange_stream_idx; |
1066 | 0 | } ccv_nnc_graph_visit_endfor |
1067 | 0 | ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, exchange_stream_idx))->device_id = default_data->device_id; |
1068 | 0 | default_data->device_id = device_id; |
1069 | 0 | } |
1070 | 4 | } |
1071 | 400 | int graph_stream_1_size = 0; |
1072 | 820 | for (i = 0; i < source_size; i++420 ) |
1073 | 420 | { |
1074 | 420 | const int idx = sources[i].d; |
1075 | | // If it has incoming nodes, check whether these are on stream 0. |
1076 | 420 | if (incomings[idx].outgoings && incomings[idx].outgoings->rnum0 ) |
1077 | 0 | { |
1078 | 0 | int flag = 0; |
1079 | 0 | const ccv_array_t* const incoming = incomings[idx].outgoings; |
1080 | 0 | for (j = 0; !flag && j < incoming->rnum; j++) |
1081 | 0 | { |
1082 | 0 | const int incoming_idx = *(int*)ccv_array_get(incoming, j); |
1083 | 0 | for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++) |
1084 | 0 | flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start. |
1085 | 0 | } |
1086 | 0 | if (flag) |
1087 | 0 | continue; |
1088 | 0 | } |
1089 | 854 | for (j = 0; 420 j < schd_info[idx].stream_size; j++434 ) |
1090 | 434 | if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need an explicit begin signal to start. |
1091 | 34 | ++graph_stream_1_size; |
1092 | 420 | } |
1093 | 400 | if (graph_stream_1_size > 0) |
1094 | 14 | { |
1095 | 14 | schedule->stream_1s = ccmalloc(sizeof(int) * graph_stream_1_size); |
1096 | 14 | graph_stream_1_size = 0; |
1097 | 48 | for (i = 0; i < source_size; i++34 ) |
1098 | 34 | { |
1099 | 34 | const int idx = sources[i].d; |
1100 | | // If it has incoming nodes, check whether these are on stream 0. |
1101 | 34 | if (incomings[idx].outgoings && incomings[idx].outgoings->rnum0 ) |
1102 | 0 | { |
1103 | 0 | int flag = 0; |
1104 | 0 | const ccv_array_t* const incoming = incomings[idx].outgoings; |
1105 | 0 | for (j = 0; !flag && j < incoming->rnum; j++) |
1106 | 0 | { |
1107 | 0 | const int incoming_idx = *(int*)ccv_array_get(incoming, j); |
1108 | 0 | for (k = 0; !flag && k < schd_info[incoming_idx].stream_size; k++) |
1109 | 0 | flag = (SCHEDULE_STREAMS(schd_info[incoming_idx])[k] == 0); // If this is the default stream, we already have a good start. |
1110 | 0 | } |
1111 | 0 | if (flag) |
1112 | 0 | continue; |
1113 | 0 | } |
1114 | 82 | for (j = 0; 34 j < schd_info[idx].stream_size; j++48 ) |
1115 | 48 | if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this is not the default stream, we need an explicit begin signal to start. |
1116 | 34 | { |
1117 | 34 | const int stream_idx = SCHEDULE_STREAMS(schd_info[idx])[j]; |
1118 | 34 | int flag = 0; |
1119 | 64 | for (k = 0; !flag && k < graph_stream_1_size; k++30 ) |
1120 | 30 | flag = (stream_idx == schedule->stream_1s[k]); |
1121 | 34 | if (!flag) |
1122 | 34 | schedule->stream_1s[graph_stream_1_size++] = stream_idx; |
1123 | 34 | } |
1124 | 34 | } |
1125 | 14 | schedule->stream_1_size = graph_stream_1_size; |
1126 | 14 | } |
1127 | 7.64k | for (i = 0; i < exec_info_size; i++7.24k ) |
1128 | 7.24k | if (incomings[i].outgoings) |
1129 | 4.80k | ccv_array_free(incomings[i].outgoings); |
1130 | 400 | ccfree(incomings); |
1131 | 400 | int graph_wait_size = 0; |
1132 | 826 | for (i = 0; i < destination_size; i++426 ) |
1133 | 426 | { |
1134 | 426 | const int idx = destinations[i].d; |
1135 | 852 | for (j = 0; j < schd_info[idx].stream_size; j++426 ) |
1136 | 426 | if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end on the default stream, we need to wait. |
1137 | 26 | ++graph_wait_size; |
1138 | 426 | } |
1139 | 400 | if (graph_wait_size > 0) |
1140 | 10 | { |
1141 | 10 | schedule->waits = ccmalloc(sizeof(int) * graph_wait_size); |
1142 | 10 | graph_wait_size = 0; |
1143 | 46 | for (i = 0; i < destination_size; i++36 ) |
1144 | 36 | { |
1145 | 36 | const int idx = destinations[i].d; |
1146 | 72 | for (j = 0; j < schd_info[idx].stream_size; j++36 ) |
1147 | 36 | if (SCHEDULE_STREAMS(schd_info[idx])[j] != 0) // If this exec_info doesn't end on the default stream, we need to wait. |
1148 | 26 | { |
1149 | 26 | ccv_nnc_stream_data_t* const default_stream_data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, 0); |
1150 | 26 | if (SCHEDULE_SIGNALS(schd_info[idx])[j] < 0) |
1151 | 26 | SCHEDULE_SIGNALS(schd_info[idx])[j] = signal_size++; |
1152 | 0 | else if (default_stream_data->signal_set && ccv_array_find_int(default_stream_data->signal_set, SCHEDULE_SIGNALS(schd_info[idx])[j])) |
1153 | 0 | continue; |
1154 | 26 | schedule->waits[graph_wait_size++] = SCHEDULE_SIGNALS(schd_info[idx])[j]; |
1155 | 26 | } |
1156 | 36 | } |
1157 | 10 | schedule->wait_size = graph_wait_size; |
1158 | 10 | } |
1159 | 1.65k | for (i = 0; i < stream_data->rnum; i++1.25k ) |
1160 | 1.25k | { |
1161 | 1.25k | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i); |
1162 | 1.25k | if (data->signal_set) |
1163 | 919 | ccv_array_free(data->signal_set); |
1164 | 1.25k | assert(data->command_set); |
1165 | 1.25k | ccv_array_free(data->command_set); |
1166 | 1.25k | } |
1167 | | // Allocate streams & signals |
1168 | 400 | int default_stream_type = stream_type; |
1169 | 400 | CCV_STREAM_SET_DEVICE_ID(default_stream_type, default_data->device_id); |
1170 | 400 | if (root_schedule) |
1171 | 337 | { |
1172 | 337 | assert(!graph->streams); |
1173 | 337 | graph->stream_size = stream_data->rnum; |
1174 | 337 | graph->streams = (ccv_nnc_stream_context_t**)ccmalloc(sizeof(ccv_nnc_stream_context_t*) * graph->stream_size); |
1175 | 337 | graph->block_stream_tasks = (co_routine_t**)cccalloc(graph->stream_size, sizeof(co_routine_t*)); |
1176 | 337 | if (stream_context) |
1177 | 4 | graph->streams[0] = stream_context; |
1178 | 1.44k | for (i = (stream_context337 ? 14 : 0333 ); i < stream_data->rnum; i++1.10k ) |
1179 | 1.10k | { |
1180 | 1.10k | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i); |
1181 | 1.10k | int type = stream_type; |
1182 | 1.10k | CCV_STREAM_SET_DEVICE_ID(type, data->device_id); |
1183 | 1.10k | graph->streams[i] = ccv_nnc_stream_context_new(type); |
1184 | 1.10k | } |
1185 | 337 | graph->signal_size = signal_size; |
1186 | 337 | graph->signals = (ccv_nnc_stream_signal_t**)cccalloc(signal_size, sizeof(ccv_nnc_stream_signal_t*)); |
1187 | 3.67k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1188 | 7.77k | for (i = 0; i < schd_info[idx].stream_size; i++4.09k ) |
1189 | 4.09k | if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0) |
1190 | 1.18k | { |
1191 | 1.18k | const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i]; |
1192 | 1.18k | if (!graph->signals[signal]) |
1193 | 1.18k | { |
1194 | 1.18k | const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]); |
1195 | 1.18k | int type = stream_type; |
1196 | 1.18k | CCV_STREAM_SET_DEVICE_ID(type, data->device_id); |
1197 | 1.18k | graph->signals[signal] = ccv_nnc_stream_signal_new(type); |
1198 | 1.18k | } |
1199 | 1.18k | } |
1200 | 3.67k | } ccv_nnc_graph_visit_endfor |
1201 | 337 | } else { |
1202 | 63 | assert(graph->streams); |
1203 | 63 | assert(graph->stream_size >= stream_data->rnum); |
1204 | | // Map scheduled streams to properly allocated streams based on the type we need. |
1205 | 63 | int* const stream_idxs = (int*)ccmalloc(sizeof(int) * (stream_data->rnum + signal_size)); |
1206 | 63 | uint64_t* const stream_used = (uint64_t*)cccalloc(((graph->stream_size + 63) >> 6) + ((graph->signal_size + 63) >> 6), sizeof(uint64_t)); |
1207 | 207 | for (i = 0; i < stream_data->rnum; i++144 ) |
1208 | 144 | { |
1209 | 144 | ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, i); |
1210 | 144 | int type = stream_type; |
1211 | 144 | CCV_STREAM_SET_DEVICE_ID(type, data->device_id); |
1212 | 476 | for (j = 0; j < graph->stream_size; j++332 ) |
1213 | 476 | if (!(stream_used[j >> 6] & ((uint64_t)1 << (j & 63)))) |
1214 | 157 | { |
1215 | 157 | const int stream_type = ccv_nnc_stream_context_type(graph->streams[j]); |
1216 | 157 | if (stream_type == type) |
1217 | 144 | { |
1218 | 144 | stream_idxs[i] = j; |
1219 | 144 | stream_used[j >> 6] |= ((uint64_t)1 << (j & 63)); |
1220 | 144 | break; |
1221 | 144 | } |
1222 | 157 | } |
1223 | 144 | } |
1224 | 63 | assert(graph->signal_size >= signal_size); |
1225 | | // Map scheduled signals to properly allocated signals based on the type we need. |
1226 | 63 | int* const signal_idxs = stream_idxs + stream_data->rnum; |
1227 | 63 | uint64_t* const signal_used = stream_used + ((graph->stream_size + 63) >> 6); |
1228 | 239 | for (i = 0; i < signal_size; i++176 ) |
1229 | 176 | signal_idxs[i] = -1; |
1230 | 1.54k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1231 | 3.08k | for (i = 0; i < schd_info[idx].stream_size; i++1.54k ) |
1232 | 1.54k | if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0) |
1233 | 176 | { |
1234 | 176 | const int signal = SCHEDULE_SIGNALS(schd_info[idx])[i]; |
1235 | 176 | if (signal_idxs[signal] < 0) |
1236 | 176 | { |
1237 | 176 | const ccv_nnc_stream_data_t* const data = (ccv_nnc_stream_data_t*)ccv_array_get(stream_data, SCHEDULE_STREAMS(schd_info[idx])[i]); |
1238 | 176 | int type = stream_type; |
1239 | 176 | CCV_STREAM_SET_DEVICE_ID(type, data->device_id); |
1240 | 2.26k | for (j = 0; j < graph->signal_size; j++2.09k ) |
1241 | 2.26k | if (!(signal_used[j >> 6] & ((uint64_t)1 << (j & 63)))) |
1242 | 334 | { |
1243 | 334 | const int signal_type = ccv_nnc_stream_signal_type(graph->signals[j]); |
1244 | 334 | if (signal_type == type) |
1245 | 176 | { |
1246 | 176 | signal_idxs[signal] = j; |
1247 | 176 | signal_used[j >> 6] |= ((uint64_t)1 << (j & 63)); |
1248 | 176 | break; |
1249 | 176 | } |
1250 | 334 | } |
1251 | 176 | } |
1252 | 176 | } |
1253 | 1.54k | } ccv_nnc_graph_visit_endfor |
1254 | | // Now rebind streams and signals from the schedule. |
1255 | 1.54k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1256 | 3.08k | for (i = 0; i < schd_info[idx].stream_size; i++1.54k ) |
1257 | 1.54k | { |
1258 | 1.54k | SCHEDULE_STREAMS(schd_info[idx])[i] = stream_idxs[SCHEDULE_STREAMS(schd_info[idx])[i]]; |
1259 | 1.54k | if (SCHEDULE_SIGNALS(schd_info[idx])[i] >= 0) |
1260 | 176 | SCHEDULE_SIGNALS(schd_info[idx])[i] = signal_idxs[SCHEDULE_SIGNALS(schd_info[idx])[i]]; |
1261 | 1.54k | } |
1262 | 1.72k | for (i = 0; i < schd_info[idx].wait_size; i++182 ) |
1263 | 182 | schd_info[idx].waits[i] = signal_idxs[schd_info[idx].waits[i]]; |
1264 | 1.54k | } ccv_nnc_graph_visit_endfor |
1265 | 83 | for (i = 0; i < schedule->stream_1_size; i++20 ) |
1266 | 20 | schedule->stream_1s[i] = stream_idxs[schedule->stream_1s[i]]; |
1267 | 89 | for (i = 0; i < schedule->wait_size; i++26 ) |
1268 | 26 | schedule->waits[i] = signal_idxs[schedule->waits[i]]; |
1269 | | // Rebind which stream is stream 0 (the default stream). |
1270 | 63 | schedule->stream_0 = stream_idxs[0]; |
1271 | 63 | ccfree(stream_used); |
1272 | 63 | ccfree(stream_idxs); |
1273 | 63 | } |
1274 | 400 | assert(graph->streams); |
1275 | 400 | ccv_nnc_graph_visit_free(visit); |
1276 | 1.76k | for (i = 0; i < signal_size; i++1.36k ) |
1277 | 1.36k | { assert(graph->signals[i]); } |
1278 | 400 | if (schedule->stream_1_size) |
1279 | 14 | schedule->begin = ccv_nnc_stream_signal_new(default_stream_type); |
1280 | 400 | schedule->end = ccv_nnc_stream_signal_new(default_stream_type); |
1281 | | // Do this recursively for its sub graphs. |
1282 | 400 | if (graph->sub_graphs) |
1283 | 7 | for (i = 0; 3 i < graph->sub_graphs->rnum; i++4 ) |
1284 | 4 | { |
1285 | 4 | ccv_nnc_graph_t* const sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i); |
1286 | 4 | if (sub_graph && !sub_graph->default_schedule) |
1287 | 4 | { |
1288 | 4 | const int exec_idx = sub_graph->exec_idx - 1; |
1289 | 4 | assert(schd_info[exec_idx].stream_size == 1); |
1290 | 4 | const int stream_idx = SCHEDULE_STREAMS(schd_info[exec_idx])[0]; |
1291 | 4 | const int device_id = ((ccv_nnc_stream_data_t*)ccv_array_get(stream_data, stream_idx))->device_id; |
1292 | 4 | sub_graph->default_schedule = _ccv_nnc_graph_static_schedule_new(sub_graph, stream_type, device_id, max_stream_count, graph->streams[stream_idx], 0, 0, 0, 0); |
1293 | 4 | } |
1294 | 4 | } |
1295 | 400 | ccv_array_free(stream_data); |
1296 | 400 | return schedule; |
1297 | 400 | } |
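// A minimal sketch (hypothetical helper, not library code) of the 64-bit
// bitmask probe used above to reuse already-allocated streams and signals:
// each bit marks a slot as taken, and we linearly scan for the first free slot
// whose type matches. `types`, `count` and `want_type` are illustrative names.
static int _sketch_find_free_slot(const int* const types, const int count, uint64_t* const used, const int want_type)
{
	int j;
	for (j = 0; j < count; j++)
		if (!(used[j >> 6] & ((uint64_t)1 << (j & 63))) && types[j] == want_type)
		{
			used[j >> 6] |= ((uint64_t)1 << (j & 63)); // Mark the slot as taken.
			return j;
		}
	return -1; // No compatible slot left; the caller would have to allocate one.
}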
1298 | | void ccv_nnc_graph_set_default_static_schedule(ccv_nnc_graph_t* const graph, const int stream_type, const int max_stream_count) |
1299 | 333 | { |
1300 | 333 | assert(graph->p == 0); |
1301 | 333 | if (graph->default_schedule) |
1302 | 0 | ccv_nnc_graph_static_schedule_free(graph->default_schedule); |
1303 | 333 | graph->default_schedule = _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, max_stream_count, 0, 0, 0, 0, 0); |
1304 | 333 | } |
1305 | | |
1306 | | ccv_nnc_graph_static_schedule_t* ccv_nnc_graph_static_schedule_new(ccv_nnc_graph_t* const graph, const int stream_type, const int max_stream_count, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size) |
1307 | 63 | { |
1308 | 63 | assert(graph->p == 0); |
1309 | 63 | return _ccv_nnc_graph_static_schedule_new(graph, stream_type, -1, max_stream_count, 0, sources, source_size, destinations, destination_size); |
1310 | 63 | } |
1311 | | |
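// A usage sketch (assuming a GPU build and that ccv_nnc_graph_run is available
// as declared in ccv_nnc.h): attach a default static schedule, then run the
// graph. The schedule, its streams and its signals are released later by
// ccv_nnc_graph_free.
//
//   ccv_nnc_graph_set_default_static_schedule(graph, CCV_STREAM_CONTEXT_GPU, 0);
//   ccv_nnc_graph_run(graph, 0, 0, 0, 0, 0);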
1312 | | ccv_nnc_stream_context_t* ccv_nnc_graph_default_stream(const ccv_nnc_graph_t* const graph) |
1313 | 9 | { |
1314 | 9 | if (graph->streams && graph->stream_size > 0) |
1315 | 9 | return graph->streams[0]; |
1316 | 0 | return 0; |
1317 | 9 | } |
1318 | | |
1319 | | static void _ccv_nnc_graph_dot_exec(const int index, const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, ccv_nnc_stream_context_t** const streams, const int flags, FILE* out) |
1320 | 961 | { |
1321 | 961 | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1322 | 959 | fputc('{', out); |
1323 | 961 | fprintf(out, "node%d", index); |
1324 | 961 | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1325 | 959 | { |
1326 | 959 | fputs("|Command: ", out); |
1327 | 959 | fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out); |
1328 | 959 | if (schd_info) |
1329 | 142 | { |
1330 | 142 | if (schd_info->stream_size > 0) |
1331 | 142 | { |
1332 | 142 | int i, flag = 0; |
1333 | 142 | fputs("|Stream: ", out); |
1334 | 296 | for (i = 0; i < schd_info->stream_size; i++154 ) |
1335 | 154 | { |
1336 | 154 | const int device_id = streams ? CCV_TENSOR_GET_DEVICE_ID(streams[SCHEDULE_STREAMS(*schd_info)[i]]->type) : 00 ; |
1337 | 154 | if (i == 0) |
1338 | 142 | fprintf(out, "%d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id); |
1339 | 12 | else |
1340 | 12 | fprintf(out, ", %d (d%d)", SCHEDULE_STREAMS(*schd_info)[i], device_id); |
1341 | 154 | } |
1342 | 296 | for (i = 0; i < schd_info->stream_size; i++154 ) |
1343 | 154 | if (SCHEDULE_SIGNALS(*schd_info)[i] >= 0) |
1344 | 69 | { |
1345 | 69 | if (!flag) |
1346 | 60 | { |
1347 | 60 | flag = 1; |
1348 | 60 | fprintf(out, "|Signal: %d", SCHEDULE_SIGNALS(*schd_info)[i]); |
1349 | 60 | } else |
1350 | 9 | fprintf(out, ", %d", SCHEDULE_SIGNALS(*schd_info)[i]); |
1351 | 69 | } |
1352 | 142 | } |
1353 | 142 | if (schd_info->wait_size > 0) |
1354 | 76 | { |
1355 | 76 | fputs("|Wait: ", out); |
1356 | 76 | int i; |
1357 | 116 | for (i = 0; i < schd_info->wait_size - 1; i++40 ) |
1358 | 40 | fprintf(out, "%d, ", schd_info->waits[i]); |
1359 | 76 | fprintf(out, "%d", schd_info->waits[schd_info->wait_size - 1]); |
1360 | 76 | } |
1361 | 142 | } |
1362 | 959 | fputc('}', out); |
1363 | 959 | } |
1364 | 961 | } |
1365 | | |
1366 | | static void _ccv_nnc_graph_dot_tensor(const int index, const ccv_nnc_tensor_t* const tensor, const int zone, const int flags, const int depth, FILE* out) |
1367 | 2.67k | { |
1368 | | // If it has an alias pointer, or it is the long form. |
1369 | 2.67k | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1370 | 2.66k | fputc('{', out); |
1371 | 2.67k | const int is_tensor_view = CCV_IS_TENSOR_VIEW(tensor); |
1372 | 2.67k | if (is_tensor_view) |
1373 | 51 | fprintf(out, "tensorview%d", index); |
1374 | 2.61k | else |
1375 | 2.61k | fprintf(out, "tensor%d", index); |
1376 | 2.67k | int i; |
1377 | 2.86k | for (i = 0; i < depth; i++195 ) // Print subscripts to denote depth. |
1378 | 195 | fputc('\'', out); |
1379 | 2.67k | if (CCV_GET_TAPE_ALLOC(tensor->type)) |
1380 | 9 | fputs(" (t)", out); |
1381 | 2.67k | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1382 | 2.66k | { |
1383 | 2.66k | const int device_id = CCV_TENSOR_GET_DEVICE_ID(tensor->info.type); |
1384 | 2.66k | fprintf(out, "|d%d|zone%d", device_id, zone); |
1385 | 2.86k | for (i = 0; i < depth; i++195 ) // Print subscripts to denote depth. |
1386 | 195 | fputc('\'', out); |
1387 | 2.66k | uintptr_t aptr = (uintptr_t)tensor->data.u8; |
1388 | 2.66k | size_t tensor_size; |
1389 | 2.66k | if (is_tensor_view) |
1390 | 51 | tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1391 | 2.61k | else |
1392 | 2.61k | tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1393 | | // Print out the range as well. |
1394 | 2.66k | fprintf(out, "|{%#010x|%#010x}|%d", (uint32_t)aptr, (uint32_t)(aptr + tensor_size - 1), tensor->info.dim[0]); |
1395 | 6.76k | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && tensor->info.dim[i]; i++4.10k ) |
1396 | 4.10k | fprintf(out, "x%d", tensor->info.dim[i]); |
1397 | 2.66k | fputc('}', out); |
1398 | 2.66k | } |
1399 | 2.67k | } |
1400 | | |
1401 | | typedef struct { |
1402 | | int index; |
1403 | | int name; |
1404 | | int zone; |
1405 | | uintptr_t tensor_ref; |
1406 | | uintptr_t start_ptr; |
1407 | | uintptr_t end_ptr; |
1408 | | } ccv_nnc_tensor_dot_t; |
1409 | | |
1410 | | typedef struct { |
1411 | | ccv_nnc_tensor_dot_t* dots; |
1412 | | int* remap; |
1413 | | int* rename_zone; |
1414 | | int* rename_index; |
1415 | | } ccv_nnc_tensor_dot_recovery_t; |
1416 | | |
1417 | | // First sort by start_ptr, then sort by tensor ptr (so that the same tensor is sorted into one cluster). |
1418 | 13.3k | #define less_than(i1, i2, aux) ((i1).start_ptr < (i2).start_ptr || (7.16k (i1).start_ptr == (i2).start_ptr7.16k && (i1).tensor_ref < (i2).tensor_ref3.51k )) |
1419 | 13.3k | static CCV_IMPLEMENT_QSORT(_ccv_nnc_tensor_dot_sort_by_ptr, ccv_nnc_tensor_dot_t, less_than) |
1420 | | #undef less_than |
1421 | | |
1422 | | static int _ccv_nnc_graph_dot_tensor_multiview_count(const ccv_nnc_tensor_multiview_t* const mv) |
1423 | 260 | { |
1424 | 260 | if (!CCV_IS_TENSOR_MULTIVIEW(mv)) |
1425 | 174 | return 1; |
1426 | 86 | const int count = mv->kind + mv->repeat; |
1427 | 86 | int i, c = 0; |
1428 | 269 | for (i = 0; i < count; i++183 ) |
1429 | 183 | c += _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i]); |
1430 | 86 | return c; |
1431 | 260 | } |
1432 | | |
1433 | | static void _ccv_nnc_graph_dot_tensor_multiview_tensor_dots(const ccv_nnc_tensor_multiview_t* const mv, ccv_nnc_tensor_dot_t* const tensor_dots, int* tensor_index) |
1434 | 86 | { |
1435 | 86 | const int count = mv->kind + mv->repeat; |
1436 | 86 | int i; |
1437 | 269 | for (i = 0; i < count; i++183 ) |
1438 | 183 | if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])) |
1439 | 9 | _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], tensor_dots, tensor_index); |
1440 | 174 | else { |
1441 | 174 | tensor_dots[*tensor_index].name = *tensor_index; |
1442 | 174 | tensor_dots[*tensor_index].start_ptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8; |
1443 | | // Because the tensor view's pointer will get updated, it is not correct in this case to share one tensor_ref. |
1444 | 174 | tensor_dots[*tensor_index].tensor_ref = tensor_dots[*tensor_index].start_ptr; |
1445 | 174 | const size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type); |
1446 | 174 | tensor_dots[*tensor_index].end_ptr = tensor_dots[*tensor_index].start_ptr + dim_size - 1; |
1447 | 174 | ++(*tensor_index); |
1448 | 174 | } |
1449 | 86 | } |
1450 | | |
1451 | | static ccv_nnc_tensor_dot_recovery_t _ccv_nnc_graph_tensor_dot_recovery(const ccv_nnc_graph_t* const graph) |
1452 | 225 | { |
1453 | 225 | int i, j; |
1454 | | // Recover tensor relationships for all tensors referenced in the graph. |
1455 | | // Most notably, we have to assign these indexes, and find whether they point |
1456 | | // to the same memory region and whether they overlap. This information was |
1457 | | // lost when we converted from the symbolic form to the execution form, and |
1458 | | // here we do our best to recover it because that makes the graph easier to |
1459 | | // understand when we present it visually (also, we don't want to put this |
1460 | | // information into the tensor or execution graph to avoid overhead, thus, |
1461 | | // recovering is the best we can do). |
1462 | 225 | int tensor_count = 0; |
1463 | 1.22k | for (i = 0; i < graph->exec_info->rnum; i++998 ) |
1464 | 998 | { |
1465 | 998 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1466 | 3.00k | for (j = 0; j < exec_info->input_size; j++2.01k ) |
1467 | 2.01k | if (exec_info->inputs[j]) |
1468 | 1.62k | tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->inputs[j])36 : 11.58k ; |
1469 | 2.17k | for (j = 0; j < exec_info->output_size; j++1.17k ) |
1470 | 1.17k | if (exec_info->outputs[j]) |
1471 | 1.12k | tensor_count += CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[j]) ? _ccv_nnc_graph_dot_tensor_multiview_count((ccv_nnc_tensor_multiview_t*)exec_info->outputs[j])41 : 11.08k ; |
1472 | 998 | } |
1473 | 225 | ccv_nnc_tensor_dot_t* tensor_dots = tensor_count > 0 ? (ccv_nnc_tensor_dot_t*)221 ccmalloc221 (sizeof(ccv_nnc_tensor_dot_t) * tensor_count) : 04 ; |
1474 | 225 | int k = 0; |
1475 | 1.22k | for (i = 0; i < graph->exec_info->rnum; i++998 ) |
1476 | 998 | { |
1477 | 998 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1478 | 3.00k | for (j = 0; j < exec_info->input_size; j++2.01k ) |
1479 | 2.01k | { |
1480 | 2.01k | ccv_nnc_tensor_t* tensor = exec_info->inputs[j]; |
1481 | 2.01k | if (!tensor) |
1482 | 391 | continue; |
1483 | 1.62k | if (CCV_IS_TENSOR_MULTIVIEW(tensor)) |
1484 | 36 | _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k); |
1485 | 1.58k | else { |
1486 | 1.58k | tensor_dots[k].name = k; |
1487 | 1.58k | tensor_dots[k].tensor_ref = (uintptr_t)tensor; |
1488 | 1.58k | tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8; |
1489 | 1.58k | size_t tensor_size; |
1490 | 1.58k | if (CCV_IS_TENSOR_VIEW(tensor)) |
1491 | 29 | tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1492 | 1.55k | else |
1493 | 1.55k | tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1494 | 1.58k | tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + tensor_size - 1; |
1495 | 1.58k | ++k; |
1496 | 1.58k | } |
1497 | 1.62k | } |
1498 | 2.17k | for (j = 0; j < exec_info->output_size; j++1.17k ) |
1499 | 1.17k | { |
1500 | 1.17k | ccv_nnc_tensor_t* tensor = exec_info->outputs[j]; |
1501 | 1.17k | if (!tensor) |
1502 | 47 | continue; |
1503 | 1.12k | if (CCV_IS_TENSOR_MULTIVIEW(tensor)) |
1504 | 41 | _ccv_nnc_graph_dot_tensor_multiview_tensor_dots((ccv_nnc_tensor_multiview_t*)tensor, tensor_dots, &k); |
1505 | 1.08k | else { |
1506 | 1.08k | tensor_dots[k].name = k; |
1507 | 1.08k | tensor_dots[k].tensor_ref = (uintptr_t)tensor; |
1508 | 1.08k | tensor_dots[k].start_ptr = (uintptr_t)tensor->data.u8; |
1509 | 1.08k | size_t tensor_size; |
1510 | 1.08k | if (CCV_IS_TENSOR_VIEW(tensor)) |
1511 | 22 | tensor_size = (size_t)((ccv_nnc_tensor_view_t*)(tensor))->stride[0] * tensor->info.dim[0] * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1512 | 1.06k | else |
1513 | 1.06k | tensor_size = ccv_nnc_dimension_count(tensor->info.dim) * CCV_GET_DATA_TYPE_SIZE(tensor->type); |
1514 | 1.08k | tensor_dots[k].end_ptr = tensor_dots[k].start_ptr + tensor_size - 1; |
1515 | 1.08k | ++k; |
1516 | 1.08k | } |
1517 | 1.12k | } |
1518 | 998 | } |
1519 | 225 | tensor_count = k; // We may have over-counted; shrink now. |
1520 | | // To group overlapping memory into one zone, we sort by start ptr first (secondarily by the tensor pointer). |
1521 | 225 | _ccv_nnc_tensor_dot_sort_by_ptr(tensor_dots, tensor_count, 0); |
1522 | 225 | int index = 0, zone = 0; |
1523 | 225 | uintptr_t tensor_ref = tensor_count > 0 ? tensor_dots[0].tensor_ref221 : 04 ; |
1524 | 225 | uintptr_t end_ptr = tensor_count > 0 ? tensor_dots[0].end_ptr221 : 04 ; |
1525 | | // Then it is trivial: we sweep by end ptr. If the next start ptr is still within the current end ptr (start ptr <= end ptr), |
1526 | | // they belong to the same zone. |
1527 | 3.06k | for (i = 0; i < tensor_count; i++2.84k ) |
1528 | 2.84k | { |
1529 | 2.84k | if (tensor_dots[i].tensor_ref != tensor_ref) |
1530 | 1.20k | { |
1531 | 1.20k | tensor_ref = tensor_dots[i].tensor_ref; |
1532 | 1.20k | ++index; |
1533 | 1.20k | } |
1534 | 2.84k | if (tensor_dots[i].start_ptr > end_ptr) |
1535 | 864 | { |
1536 | 864 | end_ptr = ccv_max(end_ptr, tensor_dots[i].end_ptr); |
1537 | 864 | ++zone; |
1538 | 864 | } |
1539 | 2.84k | tensor_dots[i].index = index; |
1540 | 2.84k | tensor_dots[i].zone = zone; |
1541 | 2.84k | } |
1542 | | // We already have index and zone assigned, but the problem is that these are not very human-interpretable (because |
1543 | | // they follow the pointers from low to high, not the tensor creation order). The following code renames both the index |
1544 | | // and the zone so that they are much more understandable. |
1545 | 225 | const int index_count = index + 1; |
1546 | 225 | const int zone_count = zone + 1; |
1547 | 225 | int* remap = (int*)ccmalloc(sizeof(int) * (tensor_count + index_count + zone_count)); |
1548 | 225 | int* rename_index = remap + tensor_count; |
1549 | 225 | int* rename_zone = rename_index + index_count; |
1550 | 3.06k | for (i = 0; i < tensor_count; i++2.84k ) |
1551 | 2.84k | remap[tensor_dots[i].name] = i; |
1552 | 1.65k | for (i = 0; i < index_count; i++1.42k ) |
1553 | 1.42k | rename_index[i] = -1; |
1554 | 1.31k | for (i = 0; i < zone_count; i++1.08k ) |
1555 | 1.08k | rename_zone[i] = -1; |
1556 | 225 | index = 0; |
1557 | 225 | zone = 0; |
1558 | 3.06k | for (i = 0; i < tensor_count; i++2.84k ) |
1559 | 2.84k | { |
1560 | 2.84k | ccv_nnc_tensor_dot_t* tensor_dot = tensor_dots + remap[i]; |
1561 | 2.84k | if (rename_index[tensor_dot->index] == -1) |
1562 | 1.42k | rename_index[tensor_dot->index] = index++; |
1563 | 2.84k | if (rename_zone[tensor_dot->zone] == -1) |
1564 | 1.08k | rename_zone[tensor_dot->zone] = zone++; |
1565 | 2.84k | } |
1566 | 225 | ccv_nnc_tensor_dot_recovery_t recovery = { |
1567 | 225 | .dots = tensor_dots, |
1568 | 225 | .remap = remap, |
1569 | 225 | .rename_index = rename_index, |
1570 | 225 | .rename_zone = rename_zone, |
1571 | 225 | }; |
1572 | 225 | return recovery; |
1573 | 225 | } |
1574 | | |
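// An illustrative helper (hypothetical, mirroring the sweep above): with the
// intervals already sorted by start_ptr, an interval joins the running zone
// while its start_ptr still falls at or before the running end_ptr.
typedef struct { uintptr_t start_ptr, end_ptr; int zone; } _sketch_interval_t;
static void _sketch_assign_zones(_sketch_interval_t* const v, const int n)
{
	int i, zone = 0;
	uintptr_t end_ptr = n > 0 ? v[0].end_ptr : 0;
	for (i = 0; i < n; i++)
	{
		if (v[i].start_ptr > end_ptr) // Disjoint from the running zone: open a new one.
		{
			end_ptr = ccv_max(end_ptr, v[i].end_ptr);
			++zone;
		}
		v[i].zone = zone;
	}
}
// For example, [0x1000,0x10ff], [0x1080,0x11ff], [0x2000,0x20ff] get zones
// 0, 0, 1: the second interval starts inside the first, the third does not.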
1575 | | static void _ccv_nnc_graph_tensor_dot_recovery_free(const ccv_nnc_tensor_dot_recovery_t recovery) |
1576 | 225 | { |
1577 | 225 | ccfree(recovery.dots); |
1578 | 225 | ccfree(recovery.remap); |
1579 | 225 | } |
1580 | | |
1581 | | static void _ccv_nnc_graph_dot_tensor_multiview_one(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int depth, int* tensor_index, FILE* out) |
1582 | 86 | { |
1583 | 86 | const int count = mv->kind + mv->repeat; |
1584 | 86 | int i, j; |
1585 | 86 | fputs("|{", out); |
1586 | 269 | for (i = 0; i < count; i++183 ) |
1587 | 183 | if (CCV_IS_TENSOR_MULTIVIEW(CCV_NNC_MULTIVIEW_DATA(mv)[i])) |
1588 | 9 | { |
1589 | 9 | fprintf(out, "{%d", i); |
1590 | 9 | if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (5 mv->kind == CCV_NNC_MULTIVIEW_K1N5 && i > 05 )) |
1591 | 9 | fputc('*', out); // Denotes that we loop on this. |
1592 | 9 | _ccv_nnc_graph_dot_tensor_multiview_one((ccv_nnc_tensor_multiview_t*)CCV_NNC_MULTIVIEW_DATA(mv)[i], recovery, depth, tensor_index, out); |
1593 | 9 | if (i == count - 1) |
1594 | 7 | fputc('}', out); |
1595 | 2 | else |
1596 | 2 | fputs("}|", out); |
1597 | 174 | } else { |
1598 | 174 | fprintf(out, "{%d", i); |
1599 | 174 | if (mv->kind == CCV_NNC_MULTIVIEW_K0N || (19 mv->kind == CCV_NNC_MULTIVIEW_K1N19 && i > 019 )) |
1600 | 163 | fputc('*', out); // Denotes that we loop on this. |
1601 | 174 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index]; |
1602 | 174 | fprintf(out, "|zone%d", recovery.rename_zone[tensor_dot->zone]); |
1603 | 368 | for (j = 0; j < depth; j++194 ) |
1604 | 194 | fputc('\'', out); |
1605 | 174 | uintptr_t aptr = (uintptr_t)CCV_NNC_MULTIVIEW_DATA(mv)[i]->data.u8; |
1606 | | // For the last one, we don't extend to the full stride. |
1607 | 174 | size_t dim_size = ccv_nnc_dimension_count(CCV_NNC_MULTIVIEW_DATA(mv)[i]->info.dim) * CCV_GET_DATA_TYPE_SIZE(CCV_NNC_MULTIVIEW_DATA(mv)[i]->type); |
1608 | | // Print out the range as well. |
1609 | 174 | fprintf(out, "|{%#010x|%#010x}", (uint32_t)aptr, (uint32_t)(aptr + dim_size - 1)); |
1610 | 174 | ++(*tensor_index); |
1611 | 174 | if (i == count - 1) |
1612 | 79 | fputc('}', out); |
1613 | 95 | else |
1614 | 95 | fputs("}|", out); |
1615 | 174 | } |
1616 | 86 | fputc('}', out); |
1617 | 86 | } |
1618 | | |
1619 | | static void _ccv_nnc_graph_dot_tensor_multiview(const ccv_nnc_tensor_multiview_t* const mv, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, int* tensor_index, FILE* out) |
1620 | 77 | { |
1621 | | // If it has an alias pointer, or it is the long form. |
1622 | 77 | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1623 | 77 | fputc('{', out); |
1624 | 77 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[*tensor_index]; |
1625 | 77 | fprintf(out, "multiview%d", recovery.rename_index[tensor_dot->index]); |
1626 | 77 | int i; |
1627 | 161 | for (i = 0; i < depth; i++84 ) // Print subscripts to denote depth. |
1628 | 84 | fputc('\'', out); |
1629 | 77 | if (CCV_GET_TAPE_ALLOC(mv->type)) |
1630 | 7 | fputs(" (t)", out); |
1631 | 77 | if (flags == CCV_NNC_LONG_DOT_GRAPH) |
1632 | 77 | { |
1633 | 77 | _ccv_nnc_graph_dot_tensor_multiview_one(mv, recovery, depth, tensor_index, out); |
1634 | 77 | const ccv_nnc_tensor_t* root = (ccv_nnc_tensor_t*)mv; |
1635 | 156 | while (CCV_IS_TENSOR_MULTIVIEW(root)) |
1636 | 79 | root = CCV_NNC_MULTIVIEW_DATA((ccv_nnc_tensor_multiview_t*)root)[0]; |
1637 | 77 | fprintf(out, "|%d", root->info.dim[0]); |
1638 | 105 | for (i = 1; i < CCV_NNC_MAX_DIM_ALLOC && root->info.dim[i]; i++28 ) |
1639 | 28 | fprintf(out, "x%d", root->info.dim[i]); |
1640 | 77 | fputc('}', out); |
1641 | 77 | } else |
1642 | 0 | *tensor_index += _ccv_nnc_graph_dot_tensor_multiview_count(mv); |
1643 | 77 | } |
1644 | | |
1645 | | static void _ccv_nnc_graph_dot_node(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_graph_exec_schedule_t* const schd_info, const int exec_index, ccv_nnc_stream_context_t** const streams, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* const tensor_index) |
1646 | 961 | { |
1647 | 961 | fprintf(out, "node%d [shape=record,label=\"", exec_index); |
1648 | 961 | _ccv_nnc_graph_dot_exec(exec_index, exec_info, schd_info, streams, flags, out); |
1649 | 961 | int i; |
1650 | 961 | int k = *tensor_index; |
1651 | 961 | if (exec_info->input_size > 0) |
1652 | 837 | { |
1653 | 837 | fputs("|{Input", out); |
1654 | 2.81k | for (i = 0; i < exec_info->input_size; i++1.97k ) |
1655 | 1.97k | if (exec_info->inputs[i]) |
1656 | 1.58k | { |
1657 | 1.58k | fputc('|', out); |
1658 | 1.58k | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i])) |
1659 | 33 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out); |
1660 | 1.55k | else { |
1661 | 1.55k | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1662 | 1.55k | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1663 | 1.55k | ++k; |
1664 | 1.55k | } |
1665 | 1.58k | } else |
1666 | 391 | fputs("|-", out); |
1667 | 837 | fputc('}', out); |
1668 | 837 | } |
1669 | 961 | if (exec_info->output_size > 0) |
1670 | 900 | { |
1671 | 900 | fputs("|{Output", out); |
1672 | 2.03k | for (i = 0; i < exec_info->output_size; i++1.13k ) |
1673 | 1.13k | if (exec_info->outputs[i]) |
1674 | 1.09k | { |
1675 | 1.09k | fputc('|', out); |
1676 | 1.09k | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i])) |
1677 | 30 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out); |
1678 | 1.06k | else { |
1679 | 1.06k | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1680 | 1.06k | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1681 | 1.06k | ++k; |
1682 | 1.06k | } |
1683 | 1.09k | } else |
1684 | 47 | fputs("|-", out); |
1685 | 900 | fputc('}', out); |
1686 | 900 | } |
1687 | 961 | fputs("\"];\n", out); |
1688 | 961 | *tensor_index = k; |
1689 | 961 | } |
1690 | | |
1691 | | static void _ccv_nnc_graph_dot_while_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const ccv_nnc_graph_t* const while_graph, const int flags, const int depth, FILE* out, int* tensor_index) |
1692 | 25 | { |
1693 | 25 | int i; |
1694 | 25 | fprintf(out, "label=<<b>while%d </b>Command: ", exec_index); |
1695 | 25 | fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out); |
1696 | 25 | fputs(">;\n", out); |
1697 | 25 | fprintf(out, "label%d [shape=record,label=\"{", exec_index); |
1698 | 25 | int k = *tensor_index; |
1699 | 25 | if (exec_info->input_size > 0) |
1700 | 16 | { |
1701 | 16 | fputs("{Input|{", out); |
1702 | 39 | for (i = 0; i < exec_info->input_size; i++23 ) |
1703 | 23 | { |
1704 | 23 | if (i > 0) |
1705 | 7 | fputc('|', out); |
1706 | 23 | if (exec_info->inputs[i]) |
1707 | 23 | { |
1708 | 23 | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i])) |
1709 | 1 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out); |
1710 | 22 | else { |
1711 | 22 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1712 | 22 | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1713 | 22 | ++k; |
1714 | 22 | } |
1715 | 23 | } else |
1716 | 0 | fputc('-', out); |
1717 | 23 | } |
1718 | 16 | fputs("}}", out); |
1719 | 16 | } |
1720 | 25 | if (exec_info->output_size > 0) |
1721 | 15 | { |
1722 | 15 | if (exec_info->input_size > 0) |
1723 | 12 | fputs("|", out); |
1724 | 15 | fputs("{Output|{", out); |
1725 | 38 | for (i = 0; i < exec_info->output_size; i++23 ) |
1726 | 23 | { |
1727 | 23 | if (i > 0) |
1728 | 8 | fputc('|', out); |
1729 | 23 | if (exec_info->outputs[i]) |
1730 | 23 | { |
1731 | 23 | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i])) |
1732 | 0 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out); |
1733 | 23 | else { |
1734 | 23 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1735 | 23 | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1736 | 23 | ++k; |
1737 | 23 | } |
1738 | 23 | } else |
1739 | 0 | fputc('-', out); |
1740 | 23 | } |
1741 | 15 | fputs("}}", out); |
1742 | 15 | } |
1743 | 25 | fputs("}\"];\n", out); |
1744 | 25 | *tensor_index = k; |
1745 | 25 | } |
1746 | | |
1747 | | static void _ccv_nnc_graph_dot_case_of_label(const ccv_nnc_graph_exec_info_t* const exec_info, const int exec_index, const ccv_nnc_tensor_dot_recovery_t recovery, const int flags, const int depth, FILE* out, int* tensor_index) |
1748 | 12 | { |
1749 | 12 | int i; |
1750 | 12 | fprintf(out, "label=<<b>caseof%d </b>Command: ", exec_index); |
1751 | 12 | fputs(ccv_nnc_cmd_name(exec_info->cmd.cmd), out); |
1752 | 12 | fputs(">;\n", out); |
1753 | 12 | fprintf(out, "label%d [shape=record,label=\"{", exec_index); |
1754 | 12 | int k = *tensor_index; |
1755 | 12 | if (exec_info->input_size > 0) |
1756 | 11 | { |
1757 | 11 | fputs("{Input|{", out); |
1758 | 22 | for (i = 0; i < exec_info->input_size; i++11 ) |
1759 | 11 | { |
1760 | 11 | if (i > 0) |
1761 | 0 | fputc('|', out); |
1762 | 11 | if (exec_info->inputs[i]) |
1763 | 11 | { |
1764 | 11 | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->inputs[i])) |
1765 | 2 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->inputs[i], recovery, flags, depth, &k, out); |
1766 | 9 | else { |
1767 | 9 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1768 | 9 | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->inputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1769 | 9 | ++k; |
1770 | 9 | } |
1771 | 11 | } else |
1772 | 0 | fputc('-', out); |
1773 | 11 | } |
1774 | 11 | fputs("}}", out); |
1775 | 11 | } |
1776 | 12 | if (exec_info->output_size > 0) |
1777 | 11 | { |
1778 | 11 | if (exec_info->input_size > 0) |
1779 | 10 | fputs("|", out); |
1780 | 11 | fputs("{Output|{", out); |
1781 | 24 | for (i = 0; i < exec_info->output_size; i++13 ) |
1782 | 13 | { |
1783 | 13 | if (i > 0) |
1784 | 2 | fputc('|', out); |
1785 | 13 | if (exec_info->outputs[i]) |
1786 | 13 | { |
1787 | 13 | if (CCV_IS_TENSOR_MULTIVIEW(exec_info->outputs[i])) |
1788 | 11 | _ccv_nnc_graph_dot_tensor_multiview((ccv_nnc_tensor_multiview_t*)exec_info->outputs[i], recovery, flags, depth, &k, out); |
1789 | 2 | else { |
1790 | 2 | const ccv_nnc_tensor_dot_t* const tensor_dot = recovery.dots + recovery.remap[k]; |
1791 | 2 | _ccv_nnc_graph_dot_tensor(recovery.rename_index[tensor_dot->index], exec_info->outputs[i], recovery.rename_zone[tensor_dot->zone], flags, depth, out); |
1792 | 2 | ++k; |
1793 | 2 | } |
1794 | 13 | } else |
1795 | 0 | fputc('-', out); |
1796 | 13 | } |
1797 | 11 | fputs("}}", out); |
1798 | 11 | } |
1799 | 12 | fputs("}\"];\n", out); |
1800 | 12 | *tensor_index = k; |
1801 | 12 | } |
1802 | | |
1803 | | static void _ccv_nnc_graph_dot_sub_graphs(const ccv_nnc_graph_exec_info_t* const exec_info, const ccv_nnc_tensor_dot_recovery_t p_recovery, const ccv_array_t* const sub_graphs, const int flags, const int depth, FILE* out, int* tensor_index, int* exec_index) |
1804 | 37 | { |
1805 | 37 | if (exec_info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) |
1806 | 25 | { |
1807 | 25 | fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index); |
1808 | 25 | const ccv_nnc_graph_t* const while_graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[0] - 1); |
1809 | | // Output this node info within this subgraph. |
1810 | 25 | _ccv_nnc_graph_dot_while_label(exec_info, *exec_index, p_recovery, while_graph, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index); |
1811 | 25 | } else if (12 exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF12 ) { |
1812 | 12 | fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\n", *exec_index, *exec_index); |
1813 | 12 | _ccv_nnc_graph_dot_case_of_label(exec_info, *exec_index, p_recovery, flags, depth - 1 /* Label all references to its level above. */, out, tensor_index); |
1814 | 12 | } |
1815 | 37 | ++(*exec_index); |
1816 | 37 | int p; |
1817 | 94 | for (p = 0; p < exec_info->graph_ref_size; p++57 ) |
1818 | 57 | { |
1819 | 57 | if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
1820 | 32 | { |
1821 | 32 | fprintf(out, "subgraph cluster%d {\nstyle=\"rounded\";\nnode%d [style=invisible];\nlabel=\"\"\n", *exec_index, *exec_index); |
1822 | 32 | ++(*exec_index); |
1823 | 32 | } |
1824 | 57 | const ccv_nnc_graph_t* const graph = *(ccv_nnc_graph_t**)ccv_array_get(sub_graphs, CCV_NNC_GRAPH_REF(exec_info)[p] - 1); |
1825 | 57 | const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule; |
1826 | 57 | ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph); |
1827 | 57 | int i, j; |
1828 | 57 | int k = 0; |
1829 | 57 | int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum); |
1830 | | // Output styles. |
1831 | 167 | for (i = 0; i < graph->exec_info->rnum; i++110 ) |
1832 | 110 | { |
1833 | 110 | node_id[i] = *exec_index; |
1834 | 110 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1835 | 110 | if (CCV_NNC_GRAPH_REF(exec_info)[0]) |
1836 | 3 | _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, depth + 1, out, &k, exec_index); |
1837 | 107 | else { |
1838 | 107 | _ccv_nnc_graph_dot_node(exec_info, |
1839 | 107 | schedule ? (6 i < schedule->exec_info_size6 ? schedule->exec_info + i6 : 00 ) : 0101 , |
1840 | 107 | *exec_index, graph->streams, recovery, flags, depth, out, &k); |
1841 | 107 | ++(*exec_index); |
1842 | 107 | } |
1843 | 110 | } |
1844 | | // Output connections. |
1845 | 167 | for (i = 0; i < graph->exec_info->rnum; i++110 ) |
1846 | 110 | { |
1847 | 110 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1848 | 110 | if (exec_info->outgoings) |
1849 | 108 | for (j = 0; 53 j < exec_info->outgoings->rnum; j++55 ) |
1850 | 55 | { |
1851 | 55 | const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j); |
1852 | 55 | const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx); |
1853 | | // If both are sub-graphs, have both tail and head specified. |
1854 | 55 | if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF1 (outgoing_info)[0]1 ) |
1855 | 0 | fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]); |
1856 | 55 | else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !1 CCV_NNC_GRAPH_REF1 (outgoing_info)[0]) |
1857 | 1 | fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]); |
1858 | 54 | else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0]) |
1859 | 3 | fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]); |
1860 | 51 | else |
1861 | 51 | fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]); |
1862 | 55 | } |
1863 | 110 | } |
1864 | 57 | fputs("}\n", out); |
1865 | 57 | _ccv_nnc_graph_tensor_dot_recovery_free(recovery); |
1866 | 57 | ccfree(node_id); |
1867 | 57 | } |
1868 | | // Extra subgraph cluster. |
1869 | 37 | if (exec_info->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
1870 | 12 | fputs("}\n", out); |
1871 | 37 | } |
1872 | | |
1873 | | void ccv_nnc_graph_dot(const ccv_nnc_graph_t* const graph, const int flags, FILE* out) |
1874 | 168 | { |
1875 | 168 | fputs("digraph G {\ncompound=true;\n", out); |
1876 | 168 | ccv_nnc_tensor_dot_recovery_t recovery = _ccv_nnc_graph_tensor_dot_recovery(graph); |
1877 | 168 | int i, j; |
1878 | 168 | int k = 0, c = 0; |
1879 | 168 | int* node_id = (int*)ccmalloc(sizeof(int) * graph->exec_info->rnum); |
1880 | 168 | const ccv_nnc_graph_static_schedule_t* const schedule = graph->default_schedule; |
1881 | | // Output styles. |
1882 | 1.05k | for (i = 0; i < graph->exec_info->rnum; i++888 ) |
1883 | 888 | { |
1884 | 888 | node_id[i] = c; |
1885 | 888 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1886 | 888 | if (CCV_NNC_GRAPH_REF(exec_info)[0]) |
1887 | 34 | _ccv_nnc_graph_dot_sub_graphs(exec_info, recovery, graph->sub_graphs, flags, 1, out, &k, &c); |
1888 | 854 | else { |
1889 | 854 | _ccv_nnc_graph_dot_node(exec_info, |
1890 | 854 | schedule ? (136 i < schedule->exec_info_size136 ? schedule->exec_info + i136 : 00 ) : 0718 , |
1891 | 854 | c, graph->streams, recovery, flags, 0, out, &k); |
1892 | 854 | ++c; |
1893 | 854 | } |
1894 | 888 | } |
1895 | | // Output connections. |
1896 | 1.05k | for (i = 0; i < graph->exec_info->rnum; i++888 ) |
1897 | 888 | { |
1898 | 888 | ccv_nnc_graph_exec_info_t* exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1899 | 888 | if (exec_info->outgoings) |
1900 | 1.80k | for (j = 0; 720 j < exec_info->outgoings->rnum; j++1.08k ) |
1901 | 1.08k | { |
1902 | 1.08k | const int outgoing_idx = *(int*)ccv_array_get(exec_info->outgoings, j); |
1903 | 1.08k | const ccv_nnc_graph_exec_info_t* const outgoing_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, outgoing_idx); |
1904 | | // If both are sub-graphs, have both tail and head specified. |
1905 | 1.08k | if (CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF18 (outgoing_info)[0]18 ) |
1906 | 3 | fprintf(out, "node%d -> node%d [ltail=cluster%d,lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i], node_id[outgoing_idx]); |
1907 | 1.08k | else if (CCV_NNC_GRAPH_REF(exec_info)[0] && !15 CCV_NNC_GRAPH_REF15 (outgoing_info)[0]) |
1908 | 15 | fprintf(out, "node%d -> node%d [ltail=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[i]); |
1909 | 1.06k | else if (!CCV_NNC_GRAPH_REF(exec_info)[0] && CCV_NNC_GRAPH_REF(outgoing_info)[0]) |
1910 | 8 | fprintf(out, "node%d -> node%d [lhead=cluster%d];\n", node_id[i], node_id[outgoing_idx], node_id[outgoing_idx]); |
1911 | 1.06k | else |
1912 | 1.06k | fprintf(out, "node%d -> node%d;\n", node_id[i], node_id[outgoing_idx]); |
1913 | 1.08k | } |
1914 | 888 | } |
1915 | 168 | fputs("}\n", out); |
1916 | 168 | _ccv_nnc_graph_tensor_dot_recovery_free(recovery); |
1917 | 168 | ccfree(node_id); |
1918 | 168 | } |
1919 | | |
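// A usage sketch (assuming `graph` is a fully constructed ccv_nnc_graph_t*):
// dump the long-form visualization and render it with Graphviz afterwards.
//
//   FILE* out = fopen("graph.dot", "w+");
//   ccv_nnc_graph_dot(graph, CCV_NNC_LONG_DOT_GRAPH, out);
//   fclose(out);
//
// Then, from the shell: dot -Tpng graph.dot -o graph.png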
1920 | | void ccv_nnc_graph_autotune(ccv_nnc_graph_t* const graph, const size_t max_workspace_size, const int flags, const ccv_nnc_graph_exec_t* const sources, const int source_size, const ccv_nnc_graph_exec_t* const destinations, const int destination_size) |
1921 | 120 | { |
1922 | | // Exec the current node; for synchronous CPU execution, there is no stream unit. |
1923 | 120 | int i; |
1924 | 120 | #define visitor(node, idx, ...) \ |
1925 | 2.68k | do { \ |
1926 | 2.68k | if (node->cmd.cmd == CCV_NNC_NOOP) \ |
1927 | 2.68k | continue99 ; \ |
1928 | 2.68k | if (2.58k node->cmd.cmd == CCV_NNC_GRAPH_FORWARD2.58k || node->cmd.cmd == CCV_NNC_GRAPH_BACKWARD2.57k ) \ |
1929 | 2.58k | for (i = 0; 12 i < node->graph_ref_size30 ; i++18 ) \ |
1930 | 18 | { \ |
1931 | 18 | ccv_nnc_graph_t* sub_graph = *(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, CCV_NNC_GRAPH_REF(node)[i] - 1); \ |
1932 | 18 | ccv_nnc_graph_autotune(sub_graph, max_workspace_size, flags, 0, 0, 0, 0); \ |
1933 | 18 | } \ |
1934 | 2.58k | else { \ |
1935 | | /* Need to unwrap these tensors */ \ |
1936 | 15.2k | for (i = 0; i < node->input_size + node->output_size; i++12.7k ) \ |
1937 | 12.7k | if (node->inputs[i] && CCV_IS_TENSOR_MULTIVIEW10.2k (node->inputs[i])) \ |
1938 | 12.7k | node->inputs[i] = _ccv_nnc_any_tensor_from_tensor_multiview((ccv_nnc_tensor_multiview_t*)node->inputs[i])13 ; \ |
1939 | 2.57k | PRINT(CCV_CLI_VERBOSE, "%s [%d]: [%d] -> [%d]\n", ccv_nnc_cmd_name(node->cmd.cmd), idx, node->input_size, node->output_size); \ |
1940 | 10.8k | for (i = 0; i < node->input_size; i++8.23k ) \ |
1941 | 8.23k | { \ |
1942 | 8.23k | PRINT(CCV_CLI_VERBOSE, "|-> %d. %p (%p)", i + 1, node->inputs[i], (node->inputs[i] ? node->inputs[i]->data.u8 : 0)); \ |
1943 | 8.23k | if (node->inputs[i] && CCV_CLI_OUTPUT_LEVEL_IS6.02k (CCV_CLI_VERBOSE)) \ |
1944 | 8.23k | ccv_nnc_print_tensor_shape(node->inputs[i])0 ; \ |
1945 | 8.23k | PRINT(CCV_CLI_VERBOSE, "\n"); \ |
1946 | 8.23k | } \ |
1947 | 7.03k | for (i = 0; i < node->output_size; i++4.46k ) \ |
1948 | 4.46k | { \ |
1949 | 4.46k | PRINT(CCV_CLI_VERBOSE, "|<- %d. %p (%p)", i + 1, node->outputs[i], (node->outputs[i] ? node->outputs[i]->data.u8 : 0)); \ |
1950 | 4.46k | if (node->outputs[i] && CCV_CLI_OUTPUT_LEVEL_IS4.26k (CCV_CLI_VERBOSE)) \ |
1951 | 4.46k | ccv_nnc_print_tensor_shape(node->outputs[i])0 ; \ |
1952 | 4.46k | PRINT(CCV_CLI_VERBOSE, "\n"); \ |
1953 | 4.46k | } \ |
1954 | 2.57k | node->cmd = ccv_nnc_cmd_autotune(node->cmd, max_workspace_size, node->hint, flags, node->inputs, node->input_size, node->outputs, node->output_size, 0); \ |
1955 | 2.57k | } \ |
1956 | 2.68k | } while (0) |
1957 | 120 | const ccv_nnc_graph_exec_t* const graph_sources = sources ? sources1 : (119 graph->sources119 ? (ccv_nnc_graph_exec_t*)116 ccv_array_get116 (graph->sources, 0): 03 ); |
1958 | 120 | const int graph_source_size = source_size ? source_size1 : (119 graph->sources119 ? graph->sources->rnum116 : 03 ); |
1959 | 120 | const ccv_nnc_graph_exec_t* const graph_destinations = destinations ? destinations1 : (119 graph->destinations119 ? (ccv_nnc_graph_exec_t*)116 ccv_array_get116 (graph->destinations, 0) : 03 ); |
1960 | 120 | const int graph_destination_size = destination_size ? destination_size1 : (119 graph->destinations119 ? graph->destinations->rnum116 : 03 ); |
1961 | 2.68k | CCV_NNC_GRAPH_VISIT120 (graph, (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, 0), graph->exec_info->rnum, graph_sources, graph_source_size, graph_destinations, graph_destination_size, 0, visitor); |
1962 | 120 | #undef visitor |
1963 | 120 | } |
1964 | | |
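// A usage sketch (assuming the graph's sources and destinations were already
// set, so the trailing arguments can be 0, just as the sub-graph recursion
// above passes 0s):
//
//   ccv_nnc_graph_autotune(graph, 512 * 1024 * 1024 /* max workspace bytes */, 0, 0, 0, 0, 0);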
1965 | | void ccv_nnc_graph_free(ccv_nnc_graph_t* const graph) |
1966 | 6.24k | { |
1967 | 6.24k | int i, j; |
1968 | 38.7k | for (i = 0; i < graph->exec_info->rnum; i++32.4k ) |
1969 | 32.4k | { |
1970 | 32.4k | ccv_nnc_graph_exec_info_t *info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(graph->exec_info, i); |
1971 | 32.4k | if (info->_heap_graph_ref) |
1972 | 8 | ccfree(info->_heap_graph_ref); |
1973 | 32.4k | ccv_array_t* outgoings = info->outgoings; |
1974 | 32.4k | if (outgoings) |
1975 | 26.2k | ccv_array_free(outgoings); |
1976 | | // We allocate inputs & outputs contiguously, therefore we only need to free the input array. |
1977 | 32.4k | if (info->inputs) |
1978 | 32.2k | ccfree(info->inputs); |
1979 | 32.4k | if (info->input_flags) |
1980 | 32.0k | ccfree(info->input_flags); |
1981 | 32.4k | if (info->updates) |
1982 | 17 | ccfree(info->updates); |
1983 | 32.4k | if ((info->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) && info->p_while.inputs27 ) |
1984 | 23 | ccfree(info->p_while.inputs); |
1985 | 32.4k | } |
1986 | 6.24k | if (graph->tensor_wraps) |
1987 | 27 | { |
1988 | 80 | for (i = 0; i < graph->tensor_wraps->rnum; i++53 ) |
1989 | 53 | { |
1990 | 53 | ccv_nnc_graph_tensor_wrap_array_t* const tensor_wrap_array = *(ccv_nnc_graph_tensor_wrap_array_t**)ccv_array_get(graph->tensor_wraps, i); |
1991 | 53 | if (tensor_wrap_array) |
1992 | 52 | { |
1993 | 195 | for (j = 0; j < tensor_wrap_array->size; j++143 ) |
1994 | 143 | _ccv_nnc_graph_tensor_wrap_free(tensor_wrap_array->tensor_wraps[j]); |
1995 | 52 | ccfree(tensor_wrap_array); |
1996 | 52 | } |
1997 | 53 | } |
1998 | 27 | ccv_array_free(graph->tensor_wraps); |
1999 | 27 | } |
2000 | 6.24k | if (graph->tensor_wraps_refs) |
2001 | 44 | ccv_array_free(graph->tensor_wraps_refs); |
2002 | 6.24k | if (graph->breakpoints) |
2003 | 26 | ccfree(graph->breakpoints); |
2004 | 6.24k | if (graph->sources) |
2005 | 6.23k | ccv_array_free(graph->sources); |
2006 | 6.24k | if (graph->destinations) |
2007 | 6.23k | ccv_array_free(graph->destinations); |
2008 | 6.24k | if (graph->default_schedule) |
2009 | 337 | ccv_nnc_graph_static_schedule_free(graph->default_schedule); |
2010 | 6.24k | if (graph->streams) |
2011 | 337 | { |
2012 | | // If the graph has parent graph, the default stream is allocated by the parent graph, we need to skip. |
2013 | 337 | if (!graph->p) |
2014 | 333 | ccv_nnc_stream_context_free(graph->streams[0]); |
2015 | 1.11k | for (i = 1; i < graph->stream_size; i++773 ) |
2016 | 773 | ccv_nnc_stream_context_free(graph->streams[i]); |
2017 | 337 | ccfree(graph->streams); |
2018 | 337 | } |
2019 | 6.24k | if (graph->block_stream_tasks) |
2020 | 337 | ccfree(graph->block_stream_tasks); |
2021 | 6.24k | if (graph->signals) |
2022 | 337 | { |
2023 | 1.52k | for (i = 0; i < graph->signal_size; i++1.18k ) |
2024 | 1.18k | ccv_nnc_stream_signal_free(graph->signals[i]); |
2025 | 337 | ccfree(graph->signals); |
2026 | 337 | } |
2027 | 6.24k | if (graph->carry_overs) |
2028 | 21 | { |
2029 | 46 | for (i = 0; i < graph->carry_overs->rnum; i++25 ) |
2030 | 25 | { |
2031 | 25 | ccv_nnc_graph_tensor_carry_over_t* const carry_over = (ccv_nnc_graph_tensor_carry_over_t*)ccv_array_get(graph->carry_overs, i); |
2032 | 25 | _ccv_nnc_graph_tensor_wrap_free(carry_over->from); |
2033 | 25 | _ccv_nnc_graph_tensor_wrap_free(carry_over->to); |
2034 | 25 | } |
2035 | 21 | ccv_array_free(graph->carry_overs); |
2036 | 21 | } |
2037 | 6.24k | if (graph->sub_graphs) |
2038 | 35 | { |
2039 | 94 | for (i = 0; i < graph->sub_graphs->rnum; i++59 ) |
2040 | 59 | ccv_nnc_graph_free(*(ccv_nnc_graph_t**)ccv_array_get(graph->sub_graphs, i)); |
2041 | 35 | ccv_array_free(graph->sub_graphs); |
2042 | 35 | } |
2043 | 6.24k | ccv_array_free(graph->exec_info); |
2044 | 6.24k | if (graph->buffer) |
2045 | 335 | ccfree(graph->buffer); |
2046 | 6.24k | ccfree(graph); |
2047 | 6.24k | } |