/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_backward.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#endif
#include "_ccv_nnc_dynamic_graph.h"

// MARK - Level-4.5 API

static void* _ccv_nnc_dynamic_compile_alloc(const int type, const int pinned_mem, const size_t size, void* const arg)
{
	assert(type & CCV_TENSOR_GPU_MEMORY);
	ccv_nnc_dy_xpu_alloc_t* const xpu_alloc = (ccv_nnc_dy_xpu_alloc_t*)arg;
	const int device = CCV_TENSOR_GET_DEVICE_ID(type);
	return ccv_nnc_xpu_alloc(xpu_alloc->xpu_alloc, device, xpu_alloc->stream, size);
}

static void _ccv_nnc_dynamic_compile_free(void* const ptr, void* const arg)
{
	ccv_nnc_xpu_alloc_t* const xpu_alloc = (ccv_nnc_xpu_alloc_t*)arg;
	ccv_nnc_xpu_free(xpu_alloc, ptr);
}

const ccv_nnc_symbolic_graph_compile_allocator_vtab_t ccv_nnc_dy_allocator_isa = {
	.alloc = _ccv_nnc_dynamic_compile_alloc,
	.free = _ccv_nnc_dynamic_compile_free
};
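// This vtab routes buffer allocations made during symbolic graph compilation
// through the dynamic graph's xpu allocator, so gradient buffers are associated
// with the active stream and can be recycled by the caching allocator rather
// than allocated fresh on every backward call.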
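// Computes the gradients of f_variables w.r.t. inputs on the recorded tape and
// writes them into outputs; df_optionals, when provided, seed the backward pass
// instead of the implicit all-ones df. A minimal usage sketch (hypothetical
// shapes; TENSOR_VARIABLE_LIST is the convenience macro from ccv_nnc_easy.h):
//
//   ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new();
//   ccv_nnc_tensor_variable_t const x = ccv_nnc_tensor_variable_new(graph, CPU_TENSOR_NHWC(32F, 1));
//   ccv_nnc_tensor_variable_t const f = ccv_nnc_tensor_variable_new(graph);
//   /* ... fill x, then compute f from x via ccv_nnc_dynamic_graph_exec(...) ... */
//   ccv_nnc_tensor_variable_t const dx = ccv_nnc_tensor_variable_new(graph);
//   ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(f), 0, TENSOR_VARIABLE_LIST(x), TENSOR_VARIABLE_LIST(dx), 0);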
void ccv_nnc_dynamic_graph_backward(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_tensor_variable_t* const f_variables, const int f_variable_size, const ccv_nnc_tensor_variable_t* const df_optionals, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	int d, i, j, k;
	assert(input_size == output_size);
	assert(input_size > 0);
	assert(output_size > 0);
	assert(f_variable_size > 0);
	int f_source_size = 0;
	// Both the f variables and the input tensor variables must have been executed at least once; otherwise we cannot differentiate.
	for (i = 0; i < f_variable_size; i++)
	{
		assert(f_variables[i]->symbol.d >= 0);
		const ccv_nnc_tensor_variable_graph_bind_t* const f_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
		assert(f_symbol_extra->sources && f_symbol_extra->sources->rnum > 0);
		f_source_size += f_symbol_extra->sources->rnum;
	}
	assert(!dynamic_graph->no_grad);
	for (i = 0; i < input_size; i++)
	{
		assert(inputs[i]->type != CCV_NNC_TENSOR_CONSTANT);
		assert(inputs[i]->symbol.d >= 0);
		assert(((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations &&
			((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations->rnum > 0);
	}
	// Fill in the symbol info for outputs.
	for (i = 0; i < output_size; i++)
		if (outputs[i] && ccv_nnc_is_tensor_auto(outputs[i]->info))
			outputs[i]->info = inputs[i]->info;
	const int exec_symbol_info_size = ccv_nnc_graph_exec_symbol_count(dynamic_graph->tape);
	ccv_array_t* const sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 1, 0);
	if (!dynamic_graph->ws)
		dynamic_graph->ws = ccv_array_new(sizeof(int), exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5), 0);
	ccv_array_t* const ws = dynamic_graph->ws;
	ccv_array_resize(ws, exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5));
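	// ws layout: two int scratch buffers of exec_symbol_info_size entries each
	// (handed to the reachability helpers below), followed by a visited bitmask
	// of (exec_symbol_info_size + 31) >> 5 uint32_t words.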
	// Set visited to all 0.
	memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
	for (i = 0; i < input_size; i++)
	{
		ccv_array_t* const destinations = ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations;
		for (j = 0; j < destinations->rnum; j++)
			ccv_nnc_insert_if_prior_to_any(dynamic_graph->tape,
				*(int*)ccv_array_get(destinations, j),
				sources, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
				(int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
	}
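	// Seed sources with the exec nodes that consume the inputs;
	// ccv_nnc_insert_if_prior_to_any keeps the set minimal by skipping nodes
	// already covered through reachability.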
	ccv_array_t* const destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), f_source_size, 0);
	for (i = 0; i < f_variable_size; i++)
	{
		const ccv_nnc_tensor_variable_graph_bind_t* const loss_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
		for (j = 0; j < loss_symbol_extra->sources->rnum; j++)
		{
			const int symbol_d = *(int*)ccv_array_get(loss_symbol_extra->sources, j);
			int flag = 0;
			for (k = 0; !flag && k < destinations->rnum; k++)
				flag = (symbol_d == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, k))->d);
			if (!flag)
			{
				const ccv_nnc_graph_exec_symbol_t symbol = {
					.d = symbol_d,
					.graph = dynamic_graph->tape
				};
				ccv_array_push(destinations, &symbol);
			}
		}
	}
	// Go over the sources, because destinations get removed all the time, and thus their indices are not stable.
	if (destinations->rnum > 1)
		for (i = 0; i < destinations->rnum; i++)
		{
			memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
			ccv_nnc_remove_if_prior_to_any(dynamic_graph->tape,
				((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, i))->d,
				destinations, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
				(int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
		}
	ccv_nnc_tensor_symbol_t f_symbols[f_variable_size];
	for (i = 0; i < f_variable_size; i++)
		f_symbols[i] = f_variables[i]->symbol;
	ccv_nnc_tensor_symbol_t input_symbols[input_size];
	for (i = 0; i < input_size; i++)
		input_symbols[i] = inputs[i]->symbol;
	ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack, 0);
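	// The hooks above push every tensor/alias/exec symbol created by the backward
	// pass onto symbol_stack, so they can all be removed from the tape once the
	// compiled graph has run (see the cleanup loop near the end).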
	ccv_nnc_symbolic_graph_backward(dynamic_graph->tape,
		f_symbols, f_variable_size, input_symbols, input_size,
		(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
		(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum);
	// Bind generated tensors.
	ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), dynamic_graph->vars->rnum + 2, 0);
	for (i = 0; i < dynamic_graph->vars->rnum; i++)
	{
		ccv_nnc_tensor_variable_t var = *(ccv_nnc_tensor_variable_t*)ccv_array_get(dynamic_graph->vars, i);
		if (var && var->tensor_view && var->symbol.d >= 0)
		{
			ccv_nnc_tensor_bind_t bind = {
				.symbol = var->symbol,
				.tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(var->tensor_view)
			};
			ccv_array_push(tensor_binds, &bind);
		}
	}
	for (i = 0; i < dynamic_graph->binds->rnum; i++)
	{
		ccv_nnc_tensor_variable_graph_bind_t* const bind = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, i);
		if (bind->index == CCV_NNC_TENSOR_NO_VARIABLE_BUT_USED && bind->tensor_view)
		{
			ccv_nnc_tensor_bind_t b = {
				.symbol = {
					.d = i,
					.graph = dynamic_graph->tape,
				},
				.tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(bind->tensor_view)
			};
			ccv_array_push(tensor_binds, &b);
		}
	}
	// The compiled graph starts from df (the gradients of f).
	ccv_array_clear(sources);
	ccv_nnc_tensor_symbol_t df_symbols[f_variable_size];
	for (i = 0; i < f_variable_size; i++)
	{
		df_symbols[i] = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, f_variables[i]->symbol);
		assert(f_symbols[i].d >= 0);
	}
	for (d = 0; d < destinations->rnum; d++)
	{
		const ccv_nnc_graph_exec_symbol_t* const destination = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, d);
		const int* outgoings; int outgoing_size;
		ccv_nnc_graph_exec_symbol_to(dynamic_graph->tape, *destination, &outgoings, &outgoing_size);
		for (i = 0; i < outgoing_size; i++)
		{
			const int exec_idx = outgoings[i];
			const int* inputs; int input_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
				.d = exec_idx,
				.graph = dynamic_graph->tape
			}, &inputs, &input_size, 0, 0);
			for (j = 0; j < input_size; j++)
			{
				const int input = inputs[j];
				const int alias_ref = input >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
					.d = input,
					.graph = dynamic_graph->tape
				}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
				// alias_ref either refers to an existing symbol, or is -1.
				int flag = 0;
				for (k = 0; !flag && k < f_variable_size; k++)
					flag = (df_symbols[k].d == input || df_symbols[k].d == alias_ref);
				if (flag)
				{
					flag = 0;
					for (k = 0; !flag && k < sources->rnum; k++)
						flag = (exec_idx == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, k))->d);
					if (!flag)
					{
						const ccv_nnc_graph_exec_symbol_t source = {
							.d = exec_idx,
							.graph = dynamic_graph->tape
						};
						ccv_array_push(sources, &source);
					}
					break;
				}
			}
		}
	}
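	// sources now holds every exec node (reached from the forward destinations)
	// that consumes a df symbol; these start the backward computation. freeables
	// below collects output variables swapped out by gradient accumulation so
	// they can be freed as the very last step.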
	int freeable_size = 0;
	ccv_nnc_tensor_variable_t freeables[output_size];
	ccv_array_clear(destinations);
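	// Scan every involved exec node for its widest input/output arity, so a single
	// set of scratch symbol/bitmask arrays below can serve all the pruning passes.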
	int max_input_size = 1;
	int max_output_size = 1;
	for (i = 0; i < sources->rnum; i++)
	{
		const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, i);
		int input_size; int output_size;
		ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, 0, &input_size, 0, &output_size);
		max_input_size = ccv_max(input_size, max_input_size);
		max_output_size = ccv_max(output_size, max_output_size);
	}
	for (i = 0; i < output_size; i++)
	{
		const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
		ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
		int input_size; int output_size;
		ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, 0, &input_size, 0, &output_size);
		max_input_size = ccv_max(input_size, max_input_size);
		max_output_size = ccv_max(output_size, max_output_size);
	}
	const int max_input_bitmask_size = ((max_input_size + 63) >> 6);
	const int max_output_bitmask_size = ((max_output_size + 63) >> 6);
	ccv_nnc_tensor_symbol_t temp_input_symbols[max_input_size];
	ccv_nnc_tensor_symbol_t temp_output_symbols[max_output_size];
	uint64_t temp_input_bitmasks[max_input_bitmask_size];
	uint64_t temp_output_bitmasks[max_output_bitmask_size];
	// Bind the dt (gradient output) tensors.
	for (i = 0; i < output_size; i++)
	{
		const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
		ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
		if (outputs[i])
		{
			if (ccv_nnc_tensor_variable_contains_value(outputs[i]))
			{
				// If the output tensor already exists, we need to accumulate the result.
				// However, if this tensor was set from outside, we don't accumulate into it
				// (people may just want to collect the result in an explicit way).
				// On the other hand, if such an external tensor view has a symbol associated
				// with it, it was not made to collect results; it was probably bound in
				// previous computations.
				// The above logic is convoluted, but it should make intuitive sense in many
				// cases.
				ccv_nnc_tensor_symbol_t inputs[2];
				inputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
				inputs[1] = symbol;
				const ccv_nnc_tensor_symbol_t output = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
				ccv_nnc_tensor_bind_t dt_bind = {
					.symbol = inputs[0],
					.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context)
				};
				ccv_array_push(tensor_binds, &dt_bind);
				ccv_nnc_graph_exec_symbol_t accum = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
				ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, destination, accum);
				destination = accum; // The accumulation unit becomes the new destination.
				freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, outputs[i]);
				dt_bind.symbol = output;
				dt_bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
				ccv_array_push(tensor_binds, &dt_bind);
			} else {
				assert(outputs[i]->symbol.d < 0);
				// Otherwise, we can directly bind to the backward output.
				ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
				const ccv_nnc_tensor_bind_t dt_bind = {
					.symbol = symbol,
					.tensor = tensor
				};
				ccv_array_push(tensor_binds, &dt_bind);
			}
		} else {
			// Remove this symbol if possible, since we don't have any use for it.
			// This won't cover cases where we need to merge them together (hence, the cmd would be a sum), so it is the best guess.
			const int* inputs; int input_size;
			const int* outputs; int output_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, &inputs, &input_size, &outputs, &output_size);
			ccv_nnc_tensor_symbol_t* input_symbols = temp_input_symbols;
			ccv_nnc_tensor_symbol_t* output_symbols = temp_output_symbols;
			uint64_t* input_bitmasks = temp_input_bitmasks;
			uint64_t* output_bitmasks = temp_output_bitmasks;
			memset(input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
			memset(output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
			const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, destination);
			// Now, check to see if we can remove this symbol from this source.
			for (k = 0; k < input_size; k++)
				if (inputs[k] >= 0)
					input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
			int flag = 0;
			for (k = 0; k < output_size; k++)
				if (outputs[k] >= 0 && outputs[k] != symbol.d)
				{
					output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
					flag = 1;
				}
			// If we can omit this output (or there is no output at all).
			if (!flag || ccv_nnc_cmd_bitmask(cmd, input_size, output_size, input_bitmasks, (input_size + 63) >> 6, output_bitmasks, (output_size + 63) >> 6))
			{
				// Set the new outputs by omitting the one.
				for (k = 0; k < input_size; k++)
					input_symbols[k] = (ccv_nnc_tensor_symbol_t){
						.d = inputs[k],
						.graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
					};
				for (k = 0; k < output_size; k++)
					if (outputs[k] != symbol.d)
						output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = outputs[k],
							.graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
						};
					else
						output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = CCV_NNC_NO_TENSOR_SYMBOL,
							.graph = 0,
						};
				ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, destination, input_symbols, input_size, output_symbols, output_size);
				// If there is no output, and this is not custom (custom may have side effects,
				// whereas the normal ops are side-effect free), set this symbol to be a noop.
				// TODO: This could be other cases regarding CCV_NNC_GRAPH_BACKWARD.
				if (!flag &&
					cmd.cmd != CCV_NNC_CUSTOM_FORWARD &&
					cmd.cmd != CCV_NNC_CUSTOM_BACKWARD)
					ccv_nnc_graph_exec_symbol_set(dynamic_graph->tape, destination, ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0));
			}
		}
		ccv_array_push(destinations, &destination);
	}
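	// destinations now holds, per requested gradient, the final exec node that
	// produces it (the EWSUM accumulation node when the output already carried a
	// value, the plain backward node otherwise).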
	// Remove the hooks only at this point.
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
		.xpu_alloc = &dynamic_graph->xpu_alloc,
		.stream = stream_context
	};
	ccv_nnc_symbolic_graph_compile_param_t compile_params = {
		.allocator = {
			.isa = &ccv_nnc_dy_allocator_isa,
			.context = {
				.alloc = &xpu_alloc,
				.free = &dynamic_graph->xpu_alloc,
			}
		}
	};
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* exec_arena = 0;
	// TODO: Should apply simplification right after the backward pass is generated.
	// Remove these if they are not needed by the cmd, for example, if their absence is assumed to mean 1.
	for (i = 0; i < f_variable_size; i++)
	{
		if (df_optionals && df_optionals[i])
		{
			const ccv_nnc_tensor_bind_t df_bind = {
				.symbol = df_symbols[i],
				.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, df_optionals[i], stream_context)
			};
			ccv_array_push(tensor_binds, &df_bind);
			continue;
		}
		if (!df_symbols[i].graph) // Skip.
			continue;
		int no_set = 0; // If we cannot find df_symbols[i] in any of the sources, we cannot predict whether it is used or not.
		for (j = 0; j < sources->rnum; j++)
		{
			const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
			const int* inputs; int input_size;
			const int* outputs; int output_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
			const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, source);
			int flag = 0;
			for (k = 0; !flag && k < input_size; k++)
			{
				const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
					.d = inputs[k],
					.graph = dynamic_graph->tape
				}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
				flag = (df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref);
			}
			if (flag)
			{
				no_set = 1;
				// Now, check to see if we can remove this symbol from this source.
				memset(temp_input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
				memset(temp_output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
				for (k = 0; k < input_size; k++)
					if (inputs[k] >= 0)
					{
						const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
							.d = inputs[k],
							.graph = dynamic_graph->tape
						}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
						if (df_symbols[i].d != inputs[k] && df_symbols[i].d != alias_ref)
							temp_input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
					}
				for (k = 0; k < output_size; k++)
					if (outputs[k] >= 0)
						temp_output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
				if (!ccv_nnc_cmd_bitmask(cmd, input_size, output_size, temp_input_bitmasks, (input_size + 63) >> 6, temp_output_bitmasks, (output_size + 63) >> 6))
					no_set = 0;
			}
		}
		if (no_set) // Remove this symbol from all sources and continue.
		{
			for (j = 0; j < sources->rnum; j++)
			{
				const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
				const int* inputs; int input_size;
				const int* outputs; int output_size;
				ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
				int flag = 0;
				for (k = 0; !flag && k < input_size; k++)
				{
					const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
						.d = inputs[k],
						.graph = dynamic_graph->tape
					}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
					flag = (df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref);
				}
				if (flag)
				{
					for (k = 0; k < input_size; k++)
						if (inputs[k] >= 0)
						{
							const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
								.d = inputs[k],
								.graph = dynamic_graph->tape
							}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
							const int no_symbol = df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref;
							temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
								.d = no_symbol ? CCV_NNC_NO_TENSOR_SYMBOL : inputs[k],
								.graph = no_symbol ? 0 : dynamic_graph->tape,
							};
						} else {
							temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
								.d = inputs[k],
								.graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
							};
						}
					for (k = 0; k < output_size; k++)
						temp_output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = outputs[k],
							.graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
						};
					ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, source, temp_input_symbols, input_size, temp_output_symbols, output_size);
				}
			}
			df_symbols[i].graph = 0;
		}
	}
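	// Any df symbol whose absence the consuming cmd tolerates (per its bitmask) has
	// now been dropped from the sources' I/O and marked by clearing .graph; only
	// the remaining ones need to be materialized as all-ones tensors below.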
	// Aggregate the remaining df symbols into set-to-one commands (the gradient of
	// f with respect to itself is 1), batching symbols of the same tensor type together.
	ccv_nnc_tensor_symbol_t df_symbols_0[f_variable_size];
	ccv_nnc_graph_exec_symbol_t set_ones[f_variable_size];
	int set_one_size = 0;
	for (i = 0; i < f_variable_size;)
		if ((df_optionals && df_optionals[i]) || !df_symbols[i].graph) // Skip.
			++i;
		else {
			df_symbols_0[0] = df_symbols[i];
			k = 1;
			int idx = f_variable_size;
			const ccv_nnc_tensor_param_t params_0 = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols_0[0]);
			for (j = i + 1; j < f_variable_size; j++)
				if (df_symbols[j].graph)
				{
					const ccv_nnc_tensor_param_t params_j = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols[j]);
					if (params_j.type != params_0.type)
					{
						if (idx == f_variable_size)
							idx = j;
					} else {
						df_symbols_0[k++] = df_symbols[j];
						assert(df_symbols[j].graph == dynamic_graph->tape);
						df_symbols[j].graph = 0;
					}
				}
			i = idx;
			set_ones[set_one_size] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(1), 0, 0, df_symbols_0, k, 0);
			for (j = 0; j < sources->rnum; j++)
				ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_ones[set_one_size], *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j));
			++set_one_size;
		}
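	// Each SET op covers every remaining df symbol of one tensor type and is wired
	// before all sources, so the ones-initialization runs first in the compiled graph.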
	// Reset them back.
	for (i = 0; i < f_variable_size; i++)
		df_symbols[i].graph = dynamic_graph->tape;
	if (set_one_size > 0)
	{
		ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
			(ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
			0, 0,
			set_ones, set_one_size,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
			&graph, &tensor_arena, &exec_arena);
	} else {
		// Otherwise we don't have a single set-to-one op; in this case, we still compile from the sources.
		ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
			(ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
			0, 0,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
			&graph, &tensor_arena, &exec_arena);
	}
	ccv_array_free(sources);
	for (i = 0; i < set_one_size; i++)
		ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, set_ones[i]);
	ccv_array_free(destinations);
	ccv_array_free(tensor_binds);
	// Remove newly added symbols to restore the graph.
	for (i = 0; i < symbol_stack->rnum; i++)
	{
		const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
		if (symbol->type == CCV_NNC_SYMBOL_TENSOR || symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS)
			ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
		else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
			ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
	}
	ccv_array_free(symbol_stack);
	// Go through inputs and outputs to find out stream type and parallel counts.
	int multi_device = 0;
	for (i = 1; !multi_device && i < input_size; i++)
		multi_device = (CCV_TENSOR_GET_DEVICE(inputs[i - 1]->info.type) != CCV_TENSOR_GET_DEVICE(inputs[i]->info.type));
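	// multi_device means the gradients span more than one device; when no external
	// stream is supplied, that requires a static schedule plus an explicit wait so
	// all per-device streams finish before returning.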
	if (stream_context)
	{
		ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context), dynamic_graph->max_stream_count);
		ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
#ifdef HAVE_MPS
		// This might be problematic once we actually have async logic going on.
		ccv_nnc_graph_free(graph);
		ccv_nnc_tensor_arena_free(tensor_arena);
		ccv_nnc_graph_exec_arena_free(exec_arena);
#else
		ccv_nnc_tensor_arena_buffer_free(tensor_arena);
		ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
		ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
#endif
	} else {
		if (multi_device)
		{
			int flag = 0;
			for (i = 0; !flag && i < input_size; i++)
				flag = (CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
			const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
			ccv_nnc_graph_set_default_static_schedule(graph, stream_type, dynamic_graph->max_stream_count);
			ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
			ccv_nnc_stream_context_wait(default_stream);
		} else
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
		ccv_nnc_graph_free(graph);
		ccv_nnc_tensor_arena_free(tensor_arena);
		ccv_nnc_graph_exec_arena_free(exec_arena);
	}
	// Now we are able to free some of the reused outputs. This needs to be the last step; otherwise some of the exec symbols
	// above may be freed by this operation.
	for (i = 0; i < freeable_size; i++)
		ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
}