/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_parallel.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_nnc_symbolic_graph.h" |
6 | | |
7 | | // MARK - Level-3.5 API |
8 | | |
9 | | enum { |
10 | | CCV_NNC_PARALLEL_BROADCAST = 0x1, |
11 | | CCV_NNC_PARALLEL_ALLREDUCER = 0x2, |
12 | | CCV_NNC_PARALLEL_REDUCER = 0x3, |
13 | | }; |
14 | | |
15 | | static int _ccv_nnc_exec_inputs_contain(const ccv_nnc_graph_exec_symbol_info_t* const node, const int d) |
16 | 1.46k | { |
17 | 1.46k | int i; |
18 | 4.82k | for (i = 0; i < node->input_size; i++) |
19 | 3.88k | if (node->inputs[i] == d) |
20 | 520 | return 1; |
21 | 940 | return 0; |
22 | 1.46k | } |
23 | | |
24 | | void ccv_nnc_symbolic_graph_data_parallel(ccv_nnc_symbolic_graph_t* const graph, const int parallel, const ccv_nnc_tensor_symbol_t* const broadcasts, const int broadcast_size, const ccv_nnc_tensor_symbol_t* const allreducers, const int allreducer_size, ccv_nnc_tensor_symbol_t* const allreducer_outs, const ccv_nnc_tensor_symbol_t* const reducers, const int reducer_size, ccv_nnc_tensor_symbol_t* const reducer_outs, const int reduce_op_type, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size) |
25 | 17 | { |
26 | 17 | assert(reduce_op_type == CCV_NNC_PARALLEL_REDUCE_OP_SUM); |
27 | 17 | const int parallel_count = (parallel == 0) ? ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU) : parallel; |
28 | 17 | if (parallel_count == 1) |
29 | 0 | return; |
30 | 17 | assert(parallel_count > 1); |
31 | 34 | ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0); |
32 | 0 | int i, j, k; |
33 | | // Tensor symbol has to be on device 0 or any. |
34 | 570 | ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) { |
35 | 2.39k | for (i = 0; i < node->input_size; i++) |
36 | 1.82k | if (node->inputs[i] >= 0) |
37 | 1.32k | { |
38 | 1.32k | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]); |
39 | 1.32k | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY && |
40 | 1.32k | CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) != CCV_COMPUTE_DEVICE_ANY) |
41 | 1.32k | { assert(CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_000); } |
42 | 1.32k | } |
43 | 1.53k | for (i = 0; i < node->output_size; i++) |
44 | 960 | if (node->outputs[i] >= 0) |
45 | 914 | { |
46 | 914 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]); |
47 | 914 | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY && |
48 | 914 | CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) != CCV_COMPUTE_DEVICE_ANY) |
49 | 911 | { assert(CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_000); } |
50 | 914 | } |
51 | 570 | } ccv_nnc_graph_visit_endfor |
52 | | // Run infer in the graph to get all tensors shaped. |
53 | 17 | ccv_nnc_symbolic_graph_symbol_infer(graph, visit, sources, source_size, destinations, destination_size, 0, 0, (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0), (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)); |
54 | | // Set ANY device to default device. Make a list of execution nodes / tensors to be duplicated. |
55 | 17 | ccv_array_t* const dup_tensors = ccv_array_new(sizeof(int), 0, 0); |
56 | 17 | ccv_array_t* const dup_execs = ccv_array_new(sizeof(int), 0, 0); |
57 | 17 | ccv_array_t* const broadcast_reduce_execs = ccv_array_new(sizeof(int), 0, 0); |
58 | 17 | int* const allreduce_inputs = allreducer_size > 0 ? (int*)ccmalloc(sizeof(int) * allreducer_size) : 0; |
59 | 149 | for (i = 0; i < allreducer_size; i++) |
60 | 132 | { |
61 | 132 | if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
62 | 0 | allreduce_inputs[i] = CCV_NNC_NO_TENSOR_SYMBOL; |
63 | 132 | else |
64 | 132 | allreduce_inputs[i] = ccv_nnc_tensor_symbol_new(graph, ccv_nnc_tensor_symbol_params(graph, allreducers[i]), 0).d; |
65 | 132 | } |
66 | 17 | const int tensor_symbol_size = graph->tensor_symbol_info->rnum; |
67 | 17 | const int graph_exec_symbol_size = graph->exec_symbol_info->rnum; |
68 | 17 | int* const tensor_flags = (int*)cccalloc(tensor_symbol_size + graph_exec_symbol_size, sizeof(int)); |
69 | 17 | int* const exec_flags = tensor_flags + tensor_symbol_size; |
70 | 29 | for (i = 0; i < broadcast_size; i++) |
71 | 12 | { |
72 | 12 | if (broadcasts[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
73 | 0 | continue; |
74 | | // Doesn't support alias for these. |
75 | 12 | tensor_flags[broadcasts[i].d] = CCV_NNC_PARALLEL_BROADCAST; |
76 | 12 | assert(graph == broadcasts[i].graph); |
77 | 12 | assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, broadcasts[i].d))->alias_ref); |
78 | 12 | } |
79 | 17 | int* const allreduce_producers = allreducer_size > 0 ? (int*)cccalloc(tensor_symbol_size, sizeof(int)) : 0; |
80 | 149 | for (i = 0; i < allreducer_size; i++) |
81 | 132 | { |
82 | 132 | if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
83 | 0 | continue; |
84 | | // Doesn't support alias for these. |
85 | 132 | tensor_flags[allreducers[i].d] = CCV_NNC_PARALLEL_ALLREDUCER; |
86 | 132 | assert(graph == allreducers[i].graph); |
87 | 132 | assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, allreducers[i].d))->alias_ref); |
88 | 132 | } |
89 | 25 | for (i = 0; i < reducer_size; i++) |
90 | 8 | { |
91 | 8 | if (reducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
92 | 0 | continue; |
93 | | // Doesn't support alias for these. |
94 | 8 | tensor_flags[reducers[i].d] = CCV_NNC_PARALLEL_REDUCER; |
95 | 8 | assert(graph == reducers[i].graph); |
96 | 8 | assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, reducers[i].d))->alias_ref); |
97 | 8 | } |
98 | | // No overlap between broadcasts, allreducers, reducers. |
99 | 29 | for (i = 0; i < broadcast_size; i++) |
100 | 12 | { |
101 | 12 | if (broadcasts[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
102 | 0 | continue; |
103 | 44 | for (j = 0; j < reducer_size; j++) |
104 | 32 | { assert(broadcasts[i].d != reducers[j].d); } |
105 | 28 | for (j = 0; j < allreducer_size; j++) |
106 | 16 | { assert(broadcasts[i].d != allreducers[j].d); } |
107 | 12 | } |
108 | 149 | for (i = 0; i < allreducer_size; i++) |
109 | 132 | { |
110 | 132 | if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
111 | 0 | continue; |
112 | 132 | for (j = 0; j < reducer_size; j++) |
113 | 0 | { assert(allreducers[i].d != reducers[j].d); } |
114 | 132 | } |
115 | 570 | ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) { |
116 | 570 | int parallelizable_data = 0; |
117 | 570 | int reduce_inputs = 0; |
118 | 570 | int broadcast_outputs = 0; |
119 | 2.39k | for (i = 0; i < node->input_size; i++) |
120 | 1.82k | if (node->inputs[i] >= 0) |
121 | 1.32k | { |
122 | 1.32k | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]); |
123 | 1.32k | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY) |
124 | 1.32k | { |
125 | 1.32k | if (CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_ANY) |
126 | 0 | CCV_TENSOR_SET_DEVICE_ID(tensor_symbol->info.type, 0); |
127 | | // Don't support alias for broadcast / allreducer / reducer. |
128 | 1.32k | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_BROADCAST); |
129 | 1.32k | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_ALLREDUCER); |
130 | 1.32k | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_REDUCER); |
131 | 1.32k | const int d = node->inputs[i]; |
132 | 1.32k | if (tensor_flags[d] == CCV_NNC_PARALLEL_REDUCER) |
133 | 8 | reduce_inputs = 1; |
134 | 1.32k | parallelizable_data = 1; |
135 | 1.32k | } |
136 | 1.32k | } |
137 | 1.53k | for (i = 0; i < node->output_size; i++) |
138 | 960 | if (node->outputs[i] >= 0) |
139 | 914 | { |
140 | 914 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]); |
141 | 914 | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY) |
142 | 914 | { |
143 | 914 | if (CCV_TENSOR_GET_DEVICE(tensor_symbol->info.type) == CCV_COMPUTE_DEVICE_ANY) |
144 | 0 | CCV_TENSOR_SET_DEVICE_ID(tensor_symbol->info.type, 0); |
145 | | // Don't support alias for broadcast / allreducer / reducer. |
146 | 914 | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_BROADCAST); |
147 | 914 | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_ALLREDUCER); |
148 | 914 | assert(!tensor_symbol->alias_ref || tensor_flags[tensor_symbol->alias_ref - 1] != CCV_NNC_PARALLEL_REDUCER); |
149 | 914 | const int d = node->outputs[i]; |
150 | 914 | if (tensor_flags[d] == CCV_NNC_PARALLEL_BROADCAST) |
151 | 0 | broadcast_outputs = 1; |
152 | 914 | else if (tensor_flags[d] == CCV_NNC_PARALLEL_ALLREDUCER) |
153 | 132 | allreduce_producers[d] = idx + 1; |
154 | 914 | parallelizable_data = 1; |
155 | 914 | } |
156 | 914 | } |
157 | 570 | assert(!(broadcast_outputs && reduce_inputs)); // This node cannot be both broadcast and reducer. |
158 | 570 | if (broadcast_outputs ^ reduce_inputs) |
159 | 8 | { |
160 | 8 | if (broadcast_outputs) |
161 | 0 | exec_flags[idx] = CCV_NNC_PARALLEL_BROADCAST; |
162 | 8 | else if (reduce_inputs) |
163 | 8 | exec_flags[idx] = CCV_NNC_PARALLEL_REDUCER; |
164 | 8 | ccv_array_push(broadcast_reduce_execs, &idx); |
165 | 562 | } else if (parallelizable_data && !broadcast_outputs && !reduce_inputs) { |
166 | | // If this node contains GPU data that needs to be parallelized, and this node itself is not a broadcast node or a reducer node. |
167 | 562 | ccv_array_push(dup_execs, &idx); |
168 | 2.36k | for (i = 0; i < node->input_size; i++) |
169 | 1.80k | if (node->inputs[i] >= 0) |
170 | 1.30k | { |
171 | 1.30k | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->inputs[i]); |
172 | 1.30k | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY) |
173 | 1.30k | { |
174 | | // Add the symbol alias to first. |
175 | 1.30k | if (tensor_symbol->alias_ref) |
176 | 112 | ccv_array_add_unique_int(dup_tensors, tensor_symbol->alias_ref - 1); |
177 | 1.30k | ccv_array_add_unique_int(dup_tensors, node->inputs[i]); |
178 | 1.30k | } |
179 | 1.30k | } |
180 | 1.50k | for (i = 0; i < node->output_size; i++) |
181 | 944 | if (node->outputs[i] >= 0) |
182 | 898 | { |
183 | 898 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, node->outputs[i]); |
184 | 898 | if (CCV_TENSOR_GET_MEMORY(tensor_symbol->info.type) == CCV_TENSOR_GPU_MEMORY) |
185 | 898 | { |
186 | 898 | if (tensor_symbol->alias_ref) |
187 | 72 | ccv_array_add_unique_int(dup_tensors, tensor_symbol->alias_ref - 1); |
188 | 898 | ccv_array_add_unique_int(dup_tensors, node->outputs[i]); |
189 | 898 | } |
190 | 898 | } |
191 | 562 | } |
192 | 570 | } ccv_nnc_graph_visit_endfor |
193 | | // Now, actually create these tensors. |
194 | 17 | if (!graph->data_parallel.tensor_symbol_idx) |
195 | 17 | graph->data_parallel.tensor_symbol_idx = (int*)ccmalloc(sizeof(int) * (parallel_count - 1) * tensor_symbol_size); |
196 | 0 | else if (graph->data_parallel.tensor_symbol_size * (graph->data_parallel.count - 1) != tensor_symbol_size * (parallel_count - 1)) |
197 | | // This may shrink too, but that is OK. |
198 | 0 | graph->data_parallel.tensor_symbol_idx = (int*)ccrealloc(graph->data_parallel.tensor_symbol_idx, sizeof(int) * (parallel_count - 1) * tensor_symbol_size); |
199 | 17 | graph->data_parallel.tensor_symbol_size = tensor_symbol_size; |
200 | 17 | graph->data_parallel.count = parallel_count; |
201 | 17 | int* const dup_tensor_idx = graph->data_parallel.tensor_symbol_idx; |
202 | | // dup_tensor_idx is the array that starts at 0 here (-1 marks a missing duplicate). |
203 | 4.24k | for (i = 0; i < (parallel_count - 1) * tensor_symbol_size; i++) |
204 | 4.22k | dup_tensor_idx[i] = -1; |
205 | | // Make the duplicated tensors (on different devices). |
206 | 1.34k | for (i = 0; i < dup_tensors->rnum; i++) |
207 | 1.32k | { |
208 | 1.32k | const int d = *(int*)ccv_array_get(dup_tensors, i); |
209 | 1.32k | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, d); |
210 | 1.32k | ccv_nnc_tensor_param_t info = tensor_symbol->info; |
211 | 1.32k | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
212 | 1.32k | const int flags = tensor_symbol->flags; |
213 | 1.32k | if (tensor_symbol->alias_ref) |
214 | 136 | { |
215 | 136 | const int alias_ref = tensor_symbol->alias_ref - 1; |
216 | 536 | for (j = 0; j < parallel_count - 1; j++) |
217 | 400 | { |
218 | 400 | const int dup_d = dup_tensor_idx[alias_ref * (parallel_count - 1) + j]; |
219 | 400 | if (j + 1 != device_id) |
220 | 400 | CCV_TENSOR_SET_DEVICE_ID(info.type, j + 1); // Set the device id. |
221 | 0 | else |
222 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
223 | 400 | assert(dup_d >= 0); |
224 | | // Get the tensor symbol again, it may be invalidated after adding a new symbol (we use it for ofs and stride). |
225 | 400 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, d); |
226 | 400 | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_alias_new(graph, (ccv_nnc_tensor_symbol_t){ |
227 | 400 | .d = dup_d, |
228 | 400 | .graph = graph, |
229 | 400 | }, tensor_symbol->ofs, tensor_symbol->stride, info, 0); |
230 | 400 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
231 | 400 | dup_tensor_idx[d * (parallel_count - 1) + j] = new_symbol.d; |
232 | 400 | } |
233 | 1.19k | } else { |
234 | 4.60k | for (j = 0; j < parallel_count - 1; j++) |
235 | 3.41k | { |
236 | 3.41k | if (j + 1 != device_id) |
237 | 3.41k | CCV_TENSOR_SET_DEVICE_ID(info.type, j + 1); // Set the device id. |
238 | 0 | else |
239 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
240 | 3.41k | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0); |
241 | 3.41k | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
242 | 3.41k | dup_tensor_idx[d * (parallel_count - 1) + j] = new_symbol.d; |
243 | 3.41k | } |
244 | 1.19k | } |
245 | 1.32k | } |
246 | 17 | ccv_array_free(dup_tensors); |
247 | | // Now, create execs. |
248 | 17 | if (!graph->data_parallel.exec_symbol_idx) |
249 | 17 | graph->data_parallel.exec_symbol_idx = (int*)ccmalloc(sizeof(int) * (parallel_count - 1) * graph_exec_symbol_size); |
250 | 0 | else if (graph->data_parallel.exec_symbol_size * (graph->data_parallel.count - 1) != graph_exec_symbol_size * (parallel_count - 1)) |
251 | | // This may shrink too, but that is OK. |
252 | 0 | graph->data_parallel.exec_symbol_idx = (int*)ccrealloc(graph->data_parallel.exec_symbol_idx, sizeof(int) * (parallel_count - 1) * graph_exec_symbol_size); |
253 | 17 | graph->data_parallel.exec_symbol_size = graph_exec_symbol_size; |
254 | 17 | int* const dup_exec_idx = graph->data_parallel.exec_symbol_idx; |
255 | | // dup_exec_idx is the array that starts at 0 here (-1 marks a missing duplicate). |
256 | 1.62k | for (i = 0; i < (parallel_count - 1) * graph_exec_symbol_size; i++) |
257 | 1.60k | dup_exec_idx[i] = -1; |
258 | 17 | int max_io_size = 1 + parallel_count; |
259 | | // Now make the duplicated execs nodes (on different devices). |
260 | 579 | for (i = 0; i < dup_execs->rnum; i++) |
261 | 562 | { |
262 | 562 | const int d = *(int*)ccv_array_get(dup_execs, i); |
263 | 562 | ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
264 | 562 | max_io_size = ccv_max(max_io_size, exec_symbol->input_size + exec_symbol->output_size); |
265 | 562 | } |
266 | 17 | max_io_size = ccv_max(max_io_size, parallel_count * 2); // inputs come from all parallel_count devices, and the output goes to all parallel_count devices (thus, allreduce). |
267 | 17 | ccv_nnc_tensor_symbol_t max_io[max_io_size]; |
268 | 579 | for (i = 0; i < dup_execs->rnum; i++) |
269 | 562 | { |
270 | 562 | const int d = *(int*)ccv_array_get(dup_execs, i); |
271 | 2.16k | for (j = 0; j < parallel_count - 1; j++) |
272 | 1.59k | { |
273 | 1.59k | ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
274 | 1.59k | const ccv_nnc_cmd_t cmd = exec_symbol->cmd; |
275 | 1.59k | const ccv_nnc_hint_t hint = exec_symbol->hint; |
276 | 1.59k | const int input_size = exec_symbol->input_size; |
277 | 1.59k | const int output_size = exec_symbol->output_size; |
278 | 1.59k | ccv_nnc_tensor_symbol_t* const inputs = max_io; |
279 | 6.78k | for (k = 0; k < input_size; k++) |
280 | 5.18k | { |
281 | 5.18k | const int idx = exec_symbol->inputs[k]; |
282 | 5.18k | if (idx >= 0) |
283 | 5.18k | inputs[k].d = dup_tensor_idx[idx * (parallel_count - 1) + j] >= 0 ? dup_tensor_idx[idx * (parallel_count - 1) + j] : idx; |
284 | 1.42k | else |
285 | 1.42k | inputs[k].d = idx; |
286 | 5.18k | inputs[k].graph = idx != CCV_NNC_NO_TENSOR_SYMBOL ? graph : 0; |
287 | 5.18k | } |
288 | 1.59k | ccv_nnc_tensor_symbol_t* const outputs = max_io + input_size; |
289 | 4.29k | for (k = 0; k < output_size; k++) |
290 | 2.70k | { |
291 | 2.70k | const int idx = exec_symbol->outputs[k]; |
292 | 2.70k | if (idx >= 0) |
293 | 2.70k | outputs[k].d = dup_tensor_idx[idx * (parallel_count - 1) + j] >= 0 ? dup_tensor_idx[idx * (parallel_count - 1) + j] : idx; |
294 | 126 | else |
295 | 126 | outputs[k].d = idx; |
296 | 2.70k | outputs[k].graph = idx != CCV_NNC_NO_TENSOR_SYMBOL ? graph : 0; |
297 | 2.70k | } |
298 | 1.59k | const ccv_nnc_graph_exec_symbol_t new_symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, inputs, input_size, outputs, output_size, 0); |
299 | 1.59k | ccv_nnc_graph_exec_symbol_set_hint(graph, new_symbol, hint); |
300 | 1.59k | dup_exec_idx[d * (parallel_count - 1) + j] = new_symbol.d; |
301 | 1.59k | } |
302 | 562 | } |
303 | | // Create new tensors for broadcast / reduce. |
304 | 17 | int* const broadcast_reduce_tensor_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int)); |
305 | 37 | for (i = 0; i < broadcast_size + reducer_size; i++) |
306 | 20 | { |
307 | 20 | const int idx = i >= broadcast_size ? reducers[i - broadcast_size].d : broadcasts[i].d; |
308 | 20 | if (idx == CCV_NNC_NO_TENSOR_SYMBOL) |
309 | 0 | continue; |
310 | 20 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, idx); |
311 | 20 | ccv_nnc_tensor_param_t info = tensor_symbol->info; |
312 | 20 | const int flags = tensor_symbol->flags; |
313 | | // No alias handling. |
314 | 20 | assert(!tensor_symbol->alias_ref); |
315 | 20 | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0); |
316 | 20 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
317 | 20 | broadcast_reduce_tensor_idx[idx] = new_symbol.d + 1; |
318 | 20 | } |
319 | 17 | int* const broadcast_exec_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int)); |
320 | 17 | int* const reduce_exec_idx = (int*)cccalloc(tensor_symbol_size, sizeof(int)); |
321 | | // Create nodes for broadcast (transfer data to the other parallel_count devices) and for reducer (transfer data back to one device, and sum). |
322 | 25 | for (i = 0; i < broadcast_reduce_execs->rnum; i++) |
323 | 8 | { |
324 | 8 | const int d = *(int*)ccv_array_get(broadcast_reduce_execs, i); |
325 | | // For broadcast, we create data transfers as our dup node, and create connections to these data transfers. |
326 | 8 | if (exec_flags[d] == CCV_NNC_PARALLEL_BROADCAST) |
327 | 0 | { |
328 | 0 | ccv_nnc_graph_exec_symbol_info_t* const exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
329 | 0 | ccv_nnc_tensor_symbol_t* const inputs = max_io; |
330 | 0 | ccv_nnc_tensor_symbol_t* const outputs = max_io + 1; |
331 | 0 | const ccv_nnc_graph_exec_symbol_t source = { |
332 | 0 | .d = d, |
333 | 0 | .graph = graph, |
334 | 0 | }; |
335 | 0 | for (j = 0; j < exec_symbol->output_size; j++) |
336 | 0 | { |
337 | 0 | const int idx = exec_symbol->outputs[j]; |
338 | 0 | if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_BROADCAST) |
339 | 0 | { |
340 | 0 | inputs[0] = (ccv_nnc_tensor_symbol_t){ |
341 | 0 | .d = idx, |
342 | 0 | .graph = graph, |
343 | 0 | }; |
344 | | // Reset the tensor flags, it is broadcasted now. |
345 | 0 | tensor_flags[idx] = 0; |
346 | 0 | outputs[0] = (ccv_nnc_tensor_symbol_t){ |
347 | 0 | .d = broadcast_reduce_tensor_idx[idx] - 1, |
348 | 0 | .graph = graph, |
349 | 0 | }; |
350 | 0 | assert(broadcast_reduce_tensor_idx[idx] > 0); |
351 | 0 | for (k = 0; k < parallel_count - 1; k++) |
352 | 0 | outputs[k + 1] = (ccv_nnc_tensor_symbol_t){ |
353 | 0 | .d = dup_tensor_idx[idx * (parallel_count - 1) + k], |
354 | 0 | .graph = graph, |
355 | 0 | }; |
356 | 0 | const ccv_nnc_graph_exec_symbol_t bcast = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_BROADCAST_FORWARD(), inputs, 1, outputs, parallel_count, 0); |
357 | 0 | ccv_nnc_graph_exec_symbol_concat(graph, source, bcast); |
358 | 0 | assert(!broadcast_exec_idx[idx]); |
359 | 0 | broadcast_exec_idx[idx] = bcast.d + 1; |
360 | 0 | } |
361 | 0 | } |
362 | 8 | } else if (exec_flags[d] == CCV_NNC_PARALLEL_REDUCER) { |
363 | | // Gather is a bit more sophisticated: we need to use the new tensor to hold the summed value. |
364 | | // This is what we have right now; I will use NCCL later. |
365 | 8 | ccv_nnc_tensor_symbol_t* const inputs = max_io; |
366 | 8 | ccv_nnc_tensor_symbol_t* const outputs = max_io + parallel_count; |
367 | 8 | ccv_nnc_graph_exec_symbol_info_t* exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
368 | 32 | for (j = 0; j < exec_symbol->input_size; j++) |
369 | 24 | { |
370 | 24 | const int idx = exec_symbol->inputs[j]; |
371 | 24 | if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_REDUCER && !reduce_exec_idx[idx]) |
372 | 8 | { |
373 | 8 | inputs[0] = (ccv_nnc_tensor_symbol_t){ |
374 | 8 | .d = idx, |
375 | 8 | .graph = graph, |
376 | 8 | }; |
377 | 16 | for (k = 0; k < parallel_count - 1; k++) |
378 | 8 | inputs[k + 1] = (ccv_nnc_tensor_symbol_t){ |
379 | 8 | .d = dup_tensor_idx[idx * (parallel_count - 1) + k], |
380 | 8 | .graph = graph, |
381 | 8 | }; |
382 | 8 | outputs[0] = (ccv_nnc_tensor_symbol_t){ |
383 | 8 | .d = broadcast_reduce_tensor_idx[idx] - 1, |
384 | 8 | .graph = graph, |
385 | 8 | }; |
386 | | // Create new symbol for all other tensors to facilitate copy (this is not useful for NCCL, but useful for REF implementation). |
387 | 8 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, idx); |
388 | 8 | ccv_nnc_tensor_param_t info = tensor_symbol->info; |
389 | 8 | const int flags = tensor_symbol->flags; |
390 | | // No alias handling. |
391 | 8 | assert(!tensor_symbol->alias_ref); |
392 | 16 | for (k = 1; k < parallel_count; k++) |
393 | 8 | { |
394 | 8 | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, info, 0); |
395 | 8 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
396 | 8 | outputs[k] = new_symbol; |
397 | 8 | } |
398 | 8 | const ccv_nnc_graph_exec_symbol_t reduce = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_REDUCE_FORWARD(), inputs, parallel_count, outputs, parallel_count, 0); |
399 | | // Refresh the pointer to keep it up to date. |
400 | 8 | exec_symbol = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
401 | 8 | ccv_nnc_graph_exec_symbol_concat(graph, reduce, (ccv_nnc_graph_exec_symbol_t){ |
402 | 8 | .d = d, |
403 | 8 | .graph = graph, |
404 | 8 | }); |
405 | 8 | reduce_exec_idx[idx] = reduce.d + 1; |
406 | 8 | } |
407 | 24 | } |
408 | | // Update the inputs pointing to the summed value. |
409 | 32 | for (j = 0; j < exec_symbol->input_size; j++) |
410 | 24 | { |
411 | 24 | const int idx = exec_symbol->inputs[j]; |
412 | 24 | if (idx >= 0 && tensor_flags[idx] == CCV_NNC_PARALLEL_REDUCER) |
413 | 8 | exec_symbol->inputs[j] = broadcast_reduce_tensor_idx[idx] - 1; |
414 | 24 | } |
415 | 8 | } |
416 | 8 | } |
417 | 17 | ccv_array_free(broadcast_reduce_execs); |
418 | | // If this tensor is not broadcasted yet, that means there is no exec that generates this tensor. We just generate a headless copy. |
419 | 579 | for (i = 0; i < dup_execs->rnum; i++) |
420 | 562 | { |
421 | 562 | const int idx = *(int*)ccv_array_get(dup_execs, i); |
422 | 562 | ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, idx); |
423 | 562 | if (exec_flags[idx] == CCV_NNC_PARALLEL_REDUCER) |
424 | 0 | continue; |
425 | | // We try to make the copy command as compact as possible by having one copy for multiple tensors if they are used together. |
426 | 562 | ccv_nnc_tensor_symbol_t* const inputs = max_io; |
427 | 562 | ccv_nnc_tensor_symbol_t* const outputs = max_io + 1; |
428 | 2.36k | for (j = 0; j < node->input_size; j++) |
429 | 1.80k | { |
430 | 1.80k | const int idx = node->inputs[j]; |
431 | | // Now, figure out whether we need to create copy command. |
432 | 1.80k | if (idx >= 0 && idx < tensor_symbol_size && tensor_flags[idx] == CCV_NNC_PARALLEL_BROADCAST) |
433 | 12 | { |
434 | 12 | inputs[0] = (ccv_nnc_tensor_symbol_t){ |
435 | 12 | .d = idx, |
436 | 12 | .graph = graph, |
437 | 12 | }; |
438 | | // Reset the tensor flags, it is broadcasted now. |
439 | 12 | tensor_flags[idx] = 0; |
440 | 12 | outputs[0] = (ccv_nnc_tensor_symbol_t){ |
441 | 12 | .d = broadcast_reduce_tensor_idx[idx] - 1, |
442 | 12 | .graph = graph, |
443 | 12 | }; |
444 | 12 | assert(broadcast_reduce_tensor_idx[idx] > 0); |
445 | 24 | for (k = 0; k < parallel_count - 1; k++) |
446 | 12 | outputs[k + 1] = (ccv_nnc_tensor_symbol_t){ |
447 | 12 | .d = dup_tensor_idx[idx * (parallel_count - 1) + k], |
448 | 12 | .graph = graph, |
449 | 12 | }; |
450 | 12 | const ccv_nnc_graph_exec_symbol_t bcast = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_BROADCAST_FORWARD(), inputs, 1, outputs, parallel_count, 0); |
451 | 12 | broadcast_exec_idx[idx] = bcast.d + 1; |
452 | 12 | } |
453 | 1.80k | } |
454 | 562 | } |
455 | | // Write reducer_outs last, because it may be the same pointer as reducers. |
456 | 17 | if (reducer_outs) |
457 | 0 | for (i = 0; i < reducer_size; i++) |
458 | 0 | { |
459 | 0 | reducer_outs[i].d = broadcast_reduce_tensor_idx[i + broadcast_size] - 1; |
460 | 0 | reducer_outs[i].graph = graph; |
461 | 0 | } |
462 | 17 | ccfree(broadcast_reduce_tensor_idx); |
463 | 17 | ccv_array_free(dup_execs); |
464 | | // Now everything is dup'ed, connect them all. |
465 | 570 | ccv_nnc_graph_visit_for(visit, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), node, idx) { |
466 | 2.39k | for (i = 0; i < node->input_size; i++) |
467 | 1.82k | { |
468 | 1.82k | const int input = node->inputs[i]; |
469 | | // If it is broadcast worthy. |
470 | 1.82k | if (input >= 0 && input < tensor_symbol_size && broadcast_exec_idx[input]) |
471 | 27 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ |
472 | 27 | .d = broadcast_exec_idx[input] - 1, |
473 | 27 | .graph = graph, |
474 | 27 | }, (ccv_nnc_graph_exec_symbol_t){ |
475 | 27 | .d = idx, |
476 | 27 | .graph = graph, |
477 | 27 | }); |
478 | 1.82k | } |
479 | | // Check whether this node has an outgoing edge to the reducer node; if so, replace it with the sum node. |
480 | 570 | if (node->outgoings && node->outgoings->rnum) |
481 | 1.29k | for (i = 0; i < node->outgoings->rnum; i++) |
482 | 871 | { |
483 | 871 | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i); |
484 | 871 | if (outgoing_idx >= graph_exec_symbol_size) |
485 | 0 | continue; |
486 | 871 | if (exec_flags[outgoing_idx] == CCV_NNC_PARALLEL_REDUCER) |
487 | 40 | for (j = 0; j < node->output_size; j++) |
488 | 28 | { |
489 | 28 | const int output_idx = node->outputs[j]; |
490 | 28 | if (output_idx >= 0 && tensor_flags[output_idx] == CCV_NNC_PARALLEL_REDUCER) |
491 | 16 | { |
492 | 16 | assert(reduce_exec_idx[output_idx]); |
493 | 16 | ccv_array_replace_unique_int(node->outgoings, outgoing_idx, reduce_exec_idx[output_idx] - 1); |
494 | 16 | } |
495 | 28 | } |
496 | 871 | } |
497 | 2.17k | for (i = 0; i < parallel_count - 1; i++) |
498 | 1.60k | { |
499 | 1.60k | const int d = dup_exec_idx[idx * (parallel_count - 1) + i]; |
500 | 1.60k | if (d < 0) |
501 | 8 | continue; |
502 | 1.59k | const ccv_nnc_graph_exec_symbol_t source = { |
503 | 1.59k | .d = d, |
504 | 1.59k | .graph = graph, |
505 | 1.59k | }; |
506 | | // If it is broadcast worthy. |
507 | 6.78k | for (j = 0; j < node->input_size; j++) |
508 | 5.18k | { |
509 | 5.18k | const int input = node->inputs[j]; |
510 | 5.18k | if (input >= 0 && input < tensor_symbol_size && broadcast_exec_idx[input]) |
511 | 19 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ |
512 | 19 | .d = broadcast_exec_idx[input] - 1, |
513 | 19 | .graph = graph, |
514 | 19 | }, source); |
515 | 5.18k | } |
516 | | // If it is reduce worthy. |
517 | 4.29k | for (j = 0; j < node->output_size; j++) |
518 | 2.70k | { |
519 | 2.70k | const int output = node->outputs[j]; |
520 | 2.70k | if (output >= 0 && output < tensor_symbol_size && reduce_exec_idx[output]) |
521 | 8 | ccv_nnc_graph_exec_symbol_concat(graph, source, (ccv_nnc_graph_exec_symbol_t){ |
522 | 8 | .d = reduce_exec_idx[output] - 1, |
523 | 8 | .graph = graph, |
524 | 8 | }); |
525 | 2.70k | } |
526 | 1.59k | if (node->outgoings && node->outgoings->rnum) |
527 | 3.67k | for (j = 0; j < node->outgoings->rnum; j++) |
528 | 2.48k | { |
529 | 2.48k | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, j); |
530 | 2.48k | if (outgoing_idx > graph_exec_symbol_size) |
531 | 8 | continue; |
532 | 2.47k | const int outgoing_d = dup_exec_idx[outgoing_idx * (parallel_count - 1) + i]; |
533 | 2.47k | if (outgoing_d < 0) |
534 | 4 | continue; |
535 | 2.47k | ccv_nnc_graph_exec_symbol_concat(graph, source, (ccv_nnc_graph_exec_symbol_t){ |
536 | 2.47k | .d = outgoing_d, |
537 | 2.47k | .graph = graph, |
538 | 2.47k | }); |
539 | 2.47k | } |
540 | 1.59k | } |
541 | 570 | } ccv_nnc_graph_visit_endfor |
542 | 17 | ccfree(broadcast_exec_idx); |
543 | 17 | ccfree(reduce_exec_idx); |
544 | 17 | ccfree(tensor_flags); |
545 | 17 | ccv_nnc_graph_visit_free(visit); |
546 | | // Allreduce is easier to do, so we do it last. It consists of two steps: |
547 | | // 1. Generate allreduce node for each symbol; |
548 | | // 2. Disconnect them from source and connect them through all reduce nodes. |
549 | 149 | for (i = 0; i < allreducer_size; i++) |
550 | 132 | { |
551 | 132 | if (allreducers[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
552 | 0 | continue; |
553 | 132 | ccv_nnc_tensor_symbol_t* const outputs = max_io + parallel_count; |
554 | 132 | outputs[0] = allreducers[i]; |
555 | | // Copy over allreducers output symbols (as the old symbol). |
556 | 520 | for (j = 0; j < parallel_count - 1; j++) |
557 | 388 | { |
558 | 388 | const int d = allreducers[i].d; |
559 | 388 | outputs[j + 1].graph = graph; |
560 | 388 | assert(dup_tensor_idx[d * (parallel_count - 1) + j] >= 0); |
561 | 388 | outputs[j + 1].d = dup_tensor_idx[d * (parallel_count - 1) + j]; |
562 | 388 | } |
563 | 132 | ccv_nnc_tensor_symbol_t* const inputs = max_io; |
564 | 132 | inputs[0].graph = graph; |
565 | 132 | inputs[0].d = allreduce_inputs[i]; |
566 | | // Create identical new tensor symbols |
567 | 520 | for (j = 0; j < parallel_count - 1; j++) |
568 | 388 | { |
569 | 388 | if (dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j] < 0) |
570 | 388 | dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j] = ccv_nnc_tensor_symbol_new(graph, ccv_nnc_tensor_symbol_params(graph, outputs[j + 1]), 0).d; |
571 | 388 | inputs[j + 1].graph = graph; |
572 | 388 | inputs[j + 1].d = dup_tensor_idx[allreduce_inputs[i] * (parallel_count - 1) + j]; |
573 | 388 | } |
574 | | // Create allreduce node. |
575 | 132 | const ccv_nnc_graph_exec_symbol_t allreduce = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMM_ALLREDUCE_FORWARD(), inputs, parallel_count, outputs, parallel_count, 0); |
576 | 132 | const int exec_idx = allreduce_producers[allreducers[i].d] - 1; |
577 | 132 | assert(exec_idx >= 0); |
578 | 132 | ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, exec_idx); |
579 | 576 | for (j = 0; j < node->output_size; j++) |
580 | 444 | if (node->outputs[j] == outputs[0].d) |
581 | 132 | node->outputs[j] = inputs[0].d; |
582 | 132 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ |
583 | 132 | .graph = graph, |
584 | 132 | .d = exec_idx, |
585 | 132 | }, allreduce); |
586 | | // Remove connections from current node directly to its following nodes (these should follow allreduce node now). |
587 | 634 | for (j = 0; j < node->outgoings->rnum;) |
588 | 502 | { |
589 | 502 | const int d = *(int*)ccv_array_get(node->outgoings, j); |
590 | 502 | if (d == allreduce.d) |
591 | 132 | { |
592 | 132 | ++j; |
593 | 132 | continue; |
594 | 132 | } |
595 | | // Get the destination nodes, and check whether they have inputs matches our outputs. |
596 | 370 | ccv_nnc_graph_exec_symbol_info_t* const outgoing_node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
597 | 370 | if (_ccv_nnc_exec_inputs_contain(outgoing_node, allreducers[i].d)) |
598 | 132 | { |
599 | 132 | ccv_nnc_graph_exec_symbol_concat(graph, allreduce, (ccv_nnc_graph_exec_symbol_t){ |
600 | 132 | .graph = graph, |
601 | 132 | .d = d, |
602 | 132 | }); |
603 | | // Remove the connection. |
604 | 132 | if (j < node->outgoings->rnum - 1) |
605 | 132 | *(int*)ccv_array_get(node->outgoings, j) = *(int*)ccv_array_get(node->outgoings, node->outgoings->rnum - 1); |
606 | 132 | --node->outgoings->rnum; |
607 | 132 | } else |
608 | 238 | ++j; |
609 | 370 | } |
610 | 520 | for (j = 0; j < parallel_count - 1; j++) |
611 | 388 | { |
612 | 388 | const int new_exec_idx = dup_exec_idx[exec_idx * (parallel_count - 1) + j]; |
613 | 388 | ccv_nnc_graph_exec_symbol_info_t* const node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, new_exec_idx); |
614 | 1.69k | for (k = 0; k < node->output_size; k++) |
615 | 1.30k | if (node->outputs[k] == outputs[j + 1].d) |
616 | 388 | node->outputs[k] = inputs[j + 1].d; |
617 | 388 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ |
618 | 388 | .graph = graph, |
619 | 388 | .d = new_exec_idx, |
620 | 388 | }, allreduce); |
621 | 1.86k | for (k = 0; k < node->outgoings->rnum;) |
622 | 1.47k | { |
623 | 1.47k | const int d = *(int*)ccv_array_get(node->outgoings, k); |
624 | 1.47k | if (d == allreduce.d) |
625 | 388 | { |
626 | 388 | ++k; |
627 | 388 | continue; |
628 | 388 | } |
629 | | // Get the destination nodes, and check whether they have inputs matches our outputs. |
630 | 1.09k | ccv_nnc_graph_exec_symbol_info_t* const outgoing_node = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d); |
631 | 1.09k | if (_ccv_nnc_exec_inputs_contain(outgoing_node, outputs[j + 1].d)) |
632 | 388 | { |
633 | 388 | ccv_nnc_graph_exec_symbol_concat(graph, allreduce, (ccv_nnc_graph_exec_symbol_t){ |
634 | 388 | .graph = graph, |
635 | 388 | .d = d, |
636 | 388 | }); |
637 | | // Remove the connection. |
638 | 388 | if (k < node->outgoings->rnum - 1) |
639 | 388 | *(int*)ccv_array_get(node->outgoings, k) = *(int*)ccv_array_get(node->outgoings, node->outgoings->rnum - 1); |
640 | 388 | --node->outgoings->rnum; |
641 | 388 | } else |
642 | 702 | ++k; |
643 | 1.09k | } |
644 | 388 | } |
645 | 132 | } |
646 | 17 | ccfree(allreduce_producers); |
647 | | // Write allreducer_outs last, because it may be the same pointer as allreducers. |
648 | 17 | if (allreducer_outs) |
649 | 136 | for (i = 0; i < allreducer_size; i++) |
650 | 128 | { |
651 | 128 | if (allreduce_inputs[i] != CCV_NNC_NO_TENSOR_SYMBOL) |
652 | 128 | { |
653 | 128 | allreducer_outs[i].d = allreduce_inputs[i]; |
654 | 128 | allreducer_outs[i].graph = graph; |
655 | 128 | } else { |
656 | 0 | allreducer_outs[i].d = CCV_NNC_NO_TENSOR_SYMBOL; |
657 | 0 | allreducer_outs[i].graph = 0; |
658 | 0 | } |
659 | 128 | } |
660 | 17 | ccfree(allreduce_inputs); |
661 | 17 | } |
662 | | |
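For orientation, a minimal caller-side sketch (not taken from this file) of how ccv_nnc_symbolic_graph_data_parallel is commonly driven: gradients to be summed across devices go in as allreducers, and CCV_NNC_PARALLEL_REDUCE_OP_SUM is the only reduce op the function accepts (see the assert at its top). Apart from the API call itself, every identifier (graph, gradients, gradient_size, sources, source_size, destinations, destination_size) is a placeholder assumed to exist in the caller.

ccv_nnc_tensor_symbol_t gradient_outs[gradient_size];
ccv_nnc_symbolic_graph_data_parallel(graph,
	0,                        /* 0: make one copy per visible GPU */
	0, 0,                     /* no explicit broadcasts */
	gradients, gradient_size, /* allreducers: summed across all devices */
	gradient_outs,            /* receives the symbols holding the allreduced values */
	0, 0, 0,                  /* no reducers */
	CCV_NNC_PARALLEL_REDUCE_OP_SUM,
	sources, source_size, destinations, destination_size);
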
663 | | ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_copy(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol, const int device_id) |
664 | 5.42k | { |
665 | 5.42k | if (!graph->data_parallel.tensor_symbol_idx) |
666 | 0 | return NO_TENSOR_SYMBOL; |
667 | 5.42k | assert(graph->data_parallel.tensor_symbol_idx); |
668 | 5.42k | assert(symbol.d >= 0); |
669 | 5.42k | assert(symbol.d < graph->data_parallel.tensor_symbol_size); |
670 | 5.42k | assert(symbol.graph == graph); |
671 | 5.42k | if (device_id == 0) |
672 | 0 | return symbol; |
673 | 5.42k | const int parallel_count = graph->data_parallel.count; |
674 | 5.42k | if (graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] < 0) |
675 | 0 | return NO_TENSOR_SYMBOL; |
676 | 5.42k | ccv_nnc_tensor_symbol_t tensor = { |
677 | 5.42k | .d = graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1], |
678 | 5.42k | .graph = graph, |
679 | 5.42k | }; |
680 | 5.42k | return tensor; |
681 | 5.42k | } |
682 | | |
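An illustrative sketch (not from this file) of how the lookup above is used after parallelization, with graph and symbol standing for any pair the caller already holds:

/* Fetch the device-1 duplicate recorded by ccv_nnc_symbolic_graph_data_parallel. */
const ccv_nnc_tensor_symbol_t copy_on_1 = ccv_nnc_tensor_symbol_copy(graph, symbol, 1);
if (copy_on_1.d != CCV_NNC_NO_TENSOR_SYMBOL)
	{ /* bind a tensor allocated on GPU 1 to copy_on_1, the mirror of symbol */ }
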
683 | | void ccv_nnc_tensor_symbol_set_copy(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol, const int device_id, const ccv_nnc_tensor_symbol_t copy) |
684 | 672 | { |
685 | 672 | assert(graph->data_parallel.tensor_symbol_idx); |
686 | 672 | assert(symbol.d >= 0); |
687 | 672 | assert(symbol.d < graph->tensor_symbol_info->rnum); |
688 | 672 | assert(symbol.graph == graph); |
689 | 672 | const int parallel_count = graph->data_parallel.count; |
690 | 672 | if (copy.d == CCV_NNC_NO_TENSOR_SYMBOL) |
691 | 0 | { |
692 | 0 | assert(symbol.d < graph->data_parallel.tensor_symbol_size); |
693 | 0 | graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = -1; |
694 | 0 | return; |
695 | 0 | } |
696 | 672 | assert(copy.d >= 0); |
697 | 672 | assert(copy.d < graph->tensor_symbol_info->rnum); |
698 | 672 | assert(copy.graph == graph); |
699 | 672 | assert(parallel_count > 1); |
700 | 672 | if (symbol.d >= graph->data_parallel.tensor_symbol_size) |
701 | 224 | { |
702 | 224 | graph->data_parallel.tensor_symbol_idx = ccrealloc(graph->data_parallel.tensor_symbol_idx, sizeof(int) * (parallel_count - 1) * (symbol.d + 1)); |
703 | 224 | int i; |
704 | 8.40k | for (i = graph->data_parallel.tensor_symbol_size * (parallel_count - 1); i < (symbol.d + 1) * (parallel_count - 1); i++) |
705 | 8.17k | graph->data_parallel.tensor_symbol_idx[i] = -1; |
706 | 224 | graph->data_parallel.tensor_symbol_size = symbol.d + 1; |
707 | 224 | } |
708 | 672 | graph->data_parallel.tensor_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = copy.d; |
709 | 672 | } |
710 | | |
711 | | ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_copy(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t symbol, const int device_id) |
712 | 30.7k | { |
713 | 30.7k | if (!graph->data_parallel.exec_symbol_idx) |
714 | 102 | return NO_GRAPH_EXEC_SYMBOL; |
715 | 30.7k | assert(graph->data_parallel.exec_symbol_idx); |
716 | 30.6k | assert(symbol.d >= 0); |
717 | 30.6k | assert(symbol.d < graph->data_parallel.exec_symbol_size); |
718 | 30.6k | assert(symbol.graph == graph); |
719 | 30.6k | if (device_id == 0) |
720 | 0 | return symbol; |
721 | 30.6k | const int parallel_count = graph->data_parallel.count; |
722 | 30.6k | if (graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] < 0) |
723 | 0 | return NO_GRAPH_EXEC_SYMBOL; |
724 | 30.6k | ccv_nnc_graph_exec_symbol_t graph_exec = { |
725 | 30.6k | .d = graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1], |
726 | 30.6k | .graph = graph, |
727 | 30.6k | }; |
728 | 30.6k | return graph_exec; |
729 | 30.6k | } |
730 | | |
731 | | void ccv_nnc_graph_exec_symbol_set_copy(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t symbol, const int device_id, const ccv_nnc_graph_exec_symbol_t copy) |
732 | 0 | { |
733 | 0 | assert(graph->data_parallel.exec_symbol_idx); |
734 | 0 | assert(symbol.d >= 0); |
735 | 0 | assert(symbol.d < graph->exec_symbol_info->rnum); |
736 | 0 | assert(symbol.graph == graph); |
737 | 0 | const int parallel_count = graph->data_parallel.count; |
738 | 0 | if (copy.d == CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
739 | 0 | { |
740 | 0 | assert(symbol.d < graph->data_parallel.exec_symbol_size); |
741 | 0 | graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = -1; |
742 | 0 | return; |
743 | 0 | } |
744 | 0 | assert(copy.d >= 0); |
745 | 0 | assert(copy.d < graph->exec_symbol_info->rnum); |
746 | 0 | assert(copy.graph == graph); |
747 | 0 | assert(parallel_count > 1); |
748 | 0 | if (symbol.d >= graph->data_parallel.exec_symbol_size) |
749 | 0 | { |
750 | 0 | graph->data_parallel.exec_symbol_idx = ccrealloc(graph->data_parallel.exec_symbol_idx, sizeof(int) * (parallel_count - 1) * (symbol.d + 1)); |
751 | 0 | int i; |
752 | 0 | for (i = graph->data_parallel.exec_symbol_size * (parallel_count - 1); i < (symbol.d + 1) * (parallel_count - 1); i++) |
753 | 0 | graph->data_parallel.exec_symbol_idx[i] = -1; |
754 | 0 | graph->data_parallel.exec_symbol_size = symbol.d + 1; |
755 | 0 | } |
756 | 0 | graph->data_parallel.exec_symbol_idx[symbol.d * (parallel_count - 1) + device_id - 1] = copy.d; |
757 | 0 | } |
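The exec-symbol accessors mirror the tensor-symbol ones. A hedged sketch of pairing them, where exec and copy_exec are placeholder symbols obtained elsewhere by the caller:

/* Record copy_exec as the device-2 duplicate of exec, then read it back. */
ccv_nnc_graph_exec_symbol_set_copy(graph, exec, 2, copy_exec);
const ccv_nnc_graph_exec_symbol_t fetched = ccv_nnc_graph_exec_symbol_copy(graph, exec, 2);
assert(fetched.d == copy_exec.d);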