/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_nnc_easy.h" |
5 | | #include "ccv_internal.h" |
6 | | #include "_ccv_nnc_dynamic_graph.h" |
7 | | #include "_ccv_cnnp_model.h" |
8 | | |
9 | | // MARK - Level-5.5 API |
10 | | |
11 | | static int _ccv_cnnp_model_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context) |
12 | 4.81k | { |
13 | 4.81k | ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data; |
14 | 4.81k | ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data; |
15 | | // I cannot just use the stream context: it cannot synchronize correctly with the existing coroutine implementation. |
16 | 4.81k | int i; |
17 | 4.81k | int wait_for_any_neighbor = 0; |
18 | 4.81k | const int parallel_count = ccv_max(model->parallel_count, 1); |
19 | 4.81k | if (stream_context) // Find all neighbor contexts and wait on them all. |
20 | 804 | for (i = 0; i < parallel_count; i++) |
21 | 408 | { |
22 | 408 | ccv_nnc_stream_context_t* const neighbor_context = ccv_nnc_stream_context_find_neighbor(stream_context, i); |
23 | 408 | if (neighbor_context && neighbor_context != stream_context) |
24 | 6 | { |
25 | 6 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(neighbor_context); |
26 | 6 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
27 | 6 | wait_for_any_neighbor = 1; |
28 | 6 | } |
29 | 408 | } |
30 | 4.81k | co_scheduler_t* old_scheduler; |
31 | 4.81k | co_routine_t* old_main; |
32 | 4.81k | if (stream_context) |
33 | 396 | { |
34 | 396 | old_main = stream_context->main; |
35 | 396 | old_scheduler = stream_context->scheduler; |
36 | | // We cannot piggyback on old scheduler. |
37 | 396 | stream_context->scheduler = 0; |
38 | | // We will have a new main coroutine when scheduled as the root. |
39 | | // Otherwise it would only be scheduled after all the existing routines have |
40 | | // been scheduled out, and that won't be right. |
41 | 396 | stream_context->main = 0; |
42 | 396 | } |
43 | 4.81k | if (cmd.cmd == CCV_NNC_CUSTOM_FORWARD) |
44 | 2.41k | { |
45 | 2.41k | ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){ |
46 | 2.41k | .requires_grad = stateful_exec->requires_grad, |
47 | 2.41k | .disable_outgrad = stateful_exec->disable_outgrad, |
48 | 2.41k | .is_test = stateful_exec->is_test, |
49 | 2.41k | }, inputs, input_size, outputs, output_size, 0, stream_context); |
50 | 2.41k | } else { |
51 | 2.40k | const int ingrad_size = model->output_size * parallel_count; |
52 | 2.40k | assert(ingrad_size <= input_size); |
53 | 2.40k | if (stateful_exec->disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) |
54 | 2.39k | ccv_cnnp_model_backward(model, inputs, ingrad_size, outputs, output_size, 0, stream_context); |
55 | 4 | else if (stateful_exec->disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL) |
56 | 0 | ccv_cnnp_model_backward(model, inputs, ingrad_size, 0, 0, 0, stream_context); |
57 | 4 | else { |
58 | 4 | assert(output_size == model->input_size * parallel_count); |
59 | 4 | int per_outgrad_size = 0; |
60 | 4 | int i, j, k; |
61 | 14 | for (i = 0; i < model->input_size; i++) |
62 | 10 | if (!(stateful_exec->disable_outgrad & ((uint64_t)1 << i))) |
63 | 4 | ++per_outgrad_size; |
64 | 4 | assert(per_outgrad_size > 0); |
65 | 4 | const int outgrad_size = per_outgrad_size * parallel_count; |
66 | 4 | ccv_nnc_tensor_t* outgrads[outgrad_size]; |
67 | 14 | for (i = 0; i < parallel_count; i++) |
68 | 32 | for (k = 0, j = 0; j < model->input_size; j++) |
69 | 22 | if (!(stateful_exec->disable_outgrad & ((uint64_t)1 << j))) |
70 | 10 | outgrads[(k++) + i * per_outgrad_size] = outputs[j + i * model->input_size]; |
71 | 4 | ccv_cnnp_model_backward(model, inputs, ingrad_size, outgrads, outgrad_size, 0, stream_context); |
72 | 4 | } |
73 | 2.40k | stateful_exec->did_backward_but_not_apply_gradients = 1; |
74 | 2.40k | } |
75 | 4.81k | if (stream_context) |
76 | 396 | { |
77 | | // Should have new scheduler created. |
78 | 396 | assert(stream_context->scheduler); |
79 | | // The new scheduler shouldn't be active (everything is scheduled). |
80 | 396 | assert(!co_scheduler_is_active(stream_context->scheduler)); |
81 | 396 | co_scheduler_free(stream_context->scheduler); |
82 | | // Switch back to the old scheduler. |
83 | 396 | stream_context->scheduler = old_scheduler; |
84 | | // The main coroutine should be cleared. |
85 | 396 | assert(!stream_context->main); |
86 | 396 | stream_context->main = old_main; |
87 | 396 | } |
88 | 4.81k | if (wait_for_any_neighbor) // Signal from this stream context and have all neighbor contexts wait on it. |
89 | 2 | { |
90 | 2 | assert(stream_context); |
91 | 2 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
92 | 10 | for (i = 0; i < parallel_count; i++) |
93 | 8 | { |
94 | 8 | ccv_nnc_stream_context_t* const neighbor_context = ccv_nnc_stream_context_find_neighbor(stream_context, i); |
95 | 8 | if (neighbor_context && neighbor_context != stream_context) |
96 | 6 | ccv_nnc_stream_context_wait_signal(neighbor_context, signal); |
97 | 8 | } |
98 | 2 | } |
99 | 4.81k | return CCV_NNC_EXEC_SUCCESS; |
100 | 4.81k | } |
101 | | |
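
The backward branch above packs the outgoing gradients: bit i of stateful_exec->disable_outgrad marks an input that receives no gradient, and the surviving output slots are compacted per data-parallel replica. For example, with 3 inputs and only input 1 being a constant, disable_outgrad == 0x2 and per_outgrad_size == 2. A minimal standalone sketch of that packing (the helper name pack_outgrads is hypothetical, not part of the library):

#include "ccv_nnc.h"
#include <stdint.h>

// Compact `outputs` into `outgrads`, skipping every input whose bit is set in
// `disable_outgrad`; returns the total number of packed outgrads.
static int pack_outgrads(const uint64_t disable_outgrad, const int input_size, const int parallel_count, ccv_nnc_tensor_t* const* const outputs, ccv_nnc_tensor_t** const outgrads)
{
	int i, j, k;
	int per_outgrad_size = 0;
	for (i = 0; i < input_size; i++)
		if (!(disable_outgrad & ((uint64_t)1 << i)))
			++per_outgrad_size;
	for (i = 0; i < parallel_count; i++)
		for (k = 0, j = 0; j < input_size; j++)
			if (!(disable_outgrad & ((uint64_t)1 << j)))
				outgrads[(k++) + i * per_outgrad_size] = outputs[j + i * input_size];
	return per_outgrad_size * parallel_count;
}
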
102 | | static void _ccv_cnnp_model_tensor_auto(const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_hint_t hint, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
103 | 4.41k | { |
104 | 4.41k | ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data; |
105 | 4.41k | ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data; |
106 | 4.41k | const int parallel_count = ccv_max(model->parallel_count, 1); |
107 | 4.41k | const int per_input_size = input_size / parallel_count; |
108 | 4.41k | assert(per_input_size > 0); |
109 | 4.41k | assert((input_size % parallel_count) == 0); |
110 | 4.41k | const int per_output_size = output_size / parallel_count; |
111 | 4.41k | assert(per_output_size > 0); |
112 | 4.41k | assert((output_size % parallel_count) == 0); |
113 | 4.41k | int i, j; |
114 | 8.84k | for (i = 0; i < parallel_count; i++) |
115 | 4.43k | { |
116 | 4.43k | ccv_cnnp_model_tensor_auto(model, outputs + i * per_output_size, per_output_size); |
117 | | // Set device id to the corresponding inputs' device id. |
118 | 4.43k | const int device_id = CCV_TENSOR_GET_DEVICE_ID(inputs[i * per_input_size].type); |
119 | 8.86k | for (j = 0; j < per_output_size; j++) |
120 | 4.43k | CCV_TENSOR_SET_DEVICE_ID(outputs[i * per_output_size + j].type, device_id); |
121 | 4.43k | } |
122 | 4.41k | } |
123 | | |
124 | | static void _ccv_cnnp_model_apply_gradients(const ccv_nnc_cmd_t cmd, ccv_nnc_stream_context_t* const stream_context) |
125 | 2.40k | { |
126 | 2.40k | ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)cmd.data; |
127 | 2.40k | ccv_cnnp_model_t* const model = (ccv_cnnp_model_t*)stateful_exec->data; |
128 | 2.40k | ccv_cnnp_model_apply_gradients(model, stream_context); |
129 | 2.40k | } |
130 | | |
131 | | static ccv_nnc_stateful_cmd_vtab_t ccv_cnnp_model_exec_isa = { |
132 | | .super = { |
133 | | .exec = _ccv_cnnp_model_exec, |
134 | | .tensor_auto = _ccv_cnnp_model_tensor_auto, |
135 | | }, |
136 | | .apply_gradients = _ccv_cnnp_model_apply_gradients, |
137 | | }; |
138 | | |
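
The vtab above is what turns a ccv_cnnp_model_t into a custom command: ccv_nnc_dynamic_graph_evaluate (further down) wraps the model pointer in a ccv_nnc_stateful_exec_t and stores it in cmd.data, so _ccv_cnnp_model_exec and _ccv_cnnp_model_tensor_auto can cast it back out. A minimal sketch of that binding, mirroring the no-grad path below (where the stateful exec can live on the stack):

ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_model_exec_isa, (ccv_nnc_cmd_param_t){}, 0);
ccv_nnc_stateful_exec_t stateful_exec = {
	.requires_grad = 0,
	.is_test = 1,
	.disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL,
	.data = model, // The ccv_cnnp_model_t* the callbacks above unpack from cmd.data.
};
cmd.data = &stateful_exec;
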
139 | | void ccv_nnc_dynamic_graph_dry_run(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context) |
140 | 0 | { |
141 | 0 | assert(input_size > 0); |
142 | 0 | const int parallel_count = ccv_max(model->parallel_count, 1); |
143 | 0 | const int per_input_size = input_size / parallel_count; |
144 | 0 | assert(per_input_size > 0); |
145 | 0 | assert((input_size % parallel_count) == 0); |
146 | 0 | int i, j; |
147 | 0 | if (!model->graph) |
148 | 0 | { |
149 | 0 | ccv_nnc_tensor_param_t input_params[per_input_size]; |
150 | 0 | for (i = 0; i < per_input_size; i++) |
151 | 0 | input_params[i] = inputs[i]->info; |
152 | 0 | ccv_cnnp_model_compile(model, input_params, per_input_size, CMD_NOOP(), CMD_NOOP()); |
153 | 0 | } else { |
154 | 0 | assert(per_input_size == model->input_size); |
155 | 0 | ccv_nnc_tensor_param_t input_params[per_input_size]; |
156 | 0 | int flag = 0; |
157 | 0 | for (i = 0; i < per_input_size; i++) |
158 | 0 | { |
159 | 0 | input_params[i] = inputs[i]->info; |
160 | 0 | const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(model->graph, model->inputs[i]); |
161 | | // If these two parameters don't match, recompile the graph. |
162 | 0 | if (memcmp(¶ms, &input_params[i], sizeof(params)) != 0) |
163 | 0 | flag = 1; |
164 | 0 | } |
165 | 0 | if (flag) // Recompile the graph. |
166 | 0 | ccv_cnnp_model_compile(model, input_params, per_input_size, ccv_cnnp_model_minimizer(model), CMD_NOOP()); |
167 | 0 | } |
168 | 0 | ccv_nnc_tensor_t* input_tensors[input_size]; |
169 | 0 | for (i = 0; i < input_size; i++) |
170 | 0 | { |
171 | | // Cannot have the parameter be a partial tensor view for model evaluation. |
172 | 0 | input_tensors[i] = inputs[i] ? ccv_nnc_tensor_from_variable(dynamic_graph, inputs[i], stream_context) : 0; |
173 | 0 | if (input_tensors[i]) |
174 | 0 | { assert(CCV_IS_TENSOR_CONTIGUOUS(input_tensors[i])); } |
175 | 0 | } |
176 | 0 | const int per_output_size = ccv_cnnp_model_output_size(model); |
177 | 0 | ccv_nnc_tensor_param_t output_params[ccv_max(1, per_output_size)]; |
178 | 0 | const int output_size = per_output_size * parallel_count; |
179 | 0 | ccv_nnc_tensor_variable_t outputs[output_size]; |
180 | 0 | ccv_nnc_tensor_t* output_tensors[output_size]; |
181 | 0 | for (i = 0; i < parallel_count; i++) |
182 | 0 | { |
183 | 0 | for (j = 0; j < per_output_size; j++) |
184 | 0 | output_params[j] = ccv_nnc_tensor_auto; |
185 | 0 | ccv_cnnp_model_tensor_auto(model, output_params, per_output_size); |
186 | 0 | for (j = 0; j < per_output_size; j++) |
187 | 0 | if (!ccv_nnc_is_tensor_auto(output_params[j])) |
188 | 0 | { |
189 | 0 | outputs[i * per_output_size + j] = ccv_nnc_tensor_variable_new(dynamic_graph, output_params[j]); |
190 | 0 | output_tensors[i * per_output_size + j] = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i * per_output_size + j], stream_context); |
191 | 0 | } else { |
192 | 0 | outputs[i * per_output_size + j] = 0; |
193 | 0 | output_tensors[i * per_output_size + j] = 0; |
194 | 0 | } |
195 | 0 | } |
196 | 0 | if (dynamic_graph->no_grad) |
197 | 0 | { |
198 | 0 | ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){ |
199 | 0 | .requires_grad = 0, |
200 | 0 | .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL, |
201 | 0 | .is_test = is_test, |
202 | 0 | }, input_tensors, input_size, output_tensors, output_size); |
203 | 0 | } else { |
204 | 0 | uint64_t disable_outgrad = 0; |
205 | 0 | int count = 0; |
206 | 0 | for (i = 0; i < per_input_size; i++) |
207 | 0 | if (!inputs[i] || inputs[i]->type == CCV_NNC_TENSOR_CONSTANT) |
208 | 0 | { |
209 | 0 | disable_outgrad |= ((uint64_t)1 << i); |
210 | 0 | ++count; |
211 | 0 | } |
212 | 0 | if (count == per_input_size) |
213 | 0 | disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL; |
214 | 0 | ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){ |
215 | 0 | .requires_grad = 1, |
216 | 0 | .disable_outgrad = disable_outgrad, |
217 | 0 | .is_test = is_test, |
218 | 0 | }, input_tensors, input_size, output_tensors, output_size); |
219 | 0 | } |
220 | | // Free the allocated variables. |
221 | 0 | for (i = 0; i < output_size; i++) |
222 | 0 | if (outputs[i]) |
223 | 0 | ccv_nnc_tensor_variable_free(dynamic_graph, outputs[i]); |
224 | 0 | } |
225 | | |
226 | | void ccv_nnc_dynamic_graph_evaluate(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
227 | 2.41k | { |
228 | 2.41k | ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_model_exec_isa, (ccv_nnc_cmd_param_t){}, 0); |
229 | 2.41k | assert(input_size > 0); |
230 | 2.41k | const int parallel_count = ccv_max(model->parallel_count, 1); |
231 | 2.41k | const int per_input_size = input_size / parallel_count; |
232 | 2.41k | assert(per_input_size > 0); |
233 | 2.41k | assert((input_size % parallel_count) == 0); |
234 | 2.41k | int i; |
235 | 2.41k | if (!model->graph) |
236 | 9 | { |
237 | 9 | ccv_nnc_tensor_param_t input_params[per_input_size]; |
238 | 24 | for (i = 0; i < per_input_size; i++) |
239 | 15 | input_params[i] = inputs[i]->info; |
240 | 9 | ccv_cnnp_model_compile(model, input_params, per_input_size, CMD_NOOP(), CMD_NOOP()); |
241 | 2.40k | } else { |
242 | 2.40k | assert(per_input_size == model->input_size); |
243 | 2.40k | ccv_nnc_tensor_param_t input_params[per_input_size]; |
244 | 2.40k | int flag = 0; |
245 | 4.81k | for (i = 0; i < per_input_size; i++) |
246 | 2.41k | { |
247 | 2.41k | input_params[i] = inputs[i]->info; |
248 | 2.41k | const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(model->graph, model->inputs[i]); |
249 | | // If these two parameters don't match, recompile the graph. |
250 | 2.41k | if (memcmp(¶ms, &input_params[i], sizeof(params)) != 0) |
251 | 2.20k | flag = 1; |
252 | 2.41k | } |
253 | 2.40k | if (flag) // Recompile the graph. |
256 | 4.86k | for (i = 0; i < input_size; i++) |
255 | 2.40k | } |
256 | 4.86k | for (i = 0; 2.41k i < input_size; i++2.45k ) |
257 | 2.45k | { |
258 | | // Cannot have the parameter be a partial tensor view for model evaluation. |
259 | 2.45k | ccv_nnc_tensor_t* const tensor = inputs[i] ? ccv_nnc_tensor_from_variable(dynamic_graph, inputs[i], stream_context) : 0; |
260 | 2.45k | if (tensor) |
261 | 2.45k | { assert(CCV_IS_TENSOR_CONTIGUOUS(tensor)); } |
262 | 2.45k | } |
263 | 2.41k | if (dynamic_graph->no_grad) |
264 | 4 | { |
265 | 4 | ccv_nnc_stateful_exec_t stateful_exec = { |
266 | 4 | .requires_grad = 0, |
267 | 4 | .is_test = is_test, |
268 | 4 | .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL, |
269 | 4 | .tensor_tape = tensor_tape, |
270 | 4 | .data = model |
271 | 4 | }; |
272 | 4 | cmd.data = &stateful_exec; |
273 | | // A parallel parameter doesn't make sense here; parallelism is defined inside the model. |
274 | 4 | ccv_nnc_dynamic_graph_exec_ret(dynamic_graph, cmd, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0, stream_context, 0); |
275 | 2.40k | } else { |
276 | 2.40k | uint64_t disable_outgrad = 0; |
277 | 2.40k | int count = 0; |
278 | 4.82k | for (i = 0; i < per_input_size; i++) |
279 | 2.41k | if (!inputs[i] || inputs[i]->type == CCV_NNC_TENSOR_CONSTANT) |
280 | 7 | { |
281 | 7 | disable_outgrad |= ((uint64_t)1 << i); |
282 | 7 | ++count; |
283 | 7 | } |
284 | 2.40k | if (count == per_input_size) |
285 | 1 | disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL; |
286 | 2.40k | ccv_nnc_stateful_exec_t* const stateful_exec = (ccv_nnc_stateful_exec_t*)ccmalloc(sizeof(ccv_nnc_stateful_exec_t)); |
287 | 2.40k | cmd.data = stateful_exec; |
288 | 2.40k | stateful_exec->requires_grad = 1; |
289 | 2.40k | stateful_exec->is_test = is_test; |
290 | 2.40k | stateful_exec->did_backward_but_not_apply_gradients = 0; |
291 | 2.40k | stateful_exec->should_free = 0; |
292 | 2.40k | stateful_exec->disable_outgrad = disable_outgrad; |
293 | 2.40k | stateful_exec->tensor_tape = tensor_tape; |
294 | 2.40k | stateful_exec->data = model; |
295 | 2.40k | stateful_exec->cmd = cmd; |
296 | 2.40k | ccv_nnc_graph_exec_symbol_t symbol = {}; |
297 | 2.40k | ccv_nnc_dynamic_graph_exec_ret(dynamic_graph, cmd, ccv_nnc_no_hint, 0, inputs, input_size, outputs, output_size, 0, stream_context, &symbol); |
298 | 2.40k | if (!symbol.graph) // This is because inputs are all constants. |
299 | 1 | ccfree(stateful_exec); // No one records it; there is no cmd.data referring to it. |
300 | 2.40k | else { |
301 | 2.40k | if (!dynamic_graph->stateful_execs) |
302 | 12 | { |
303 | 12 | dynamic_graph->stateful_execs = ccv_array_new(sizeof(ccv_nnc_stateful_exec_t*), 1, 0); |
304 | 12 | ccv_array_push(dynamic_graph->stateful_execs, &stateful_exec); |
305 | 12 | stateful_exec->index = dynamic_graph->stateful_execs->rnum - 1; |
306 | 2.39k | } else { |
307 | 2.39k | if (dynamic_graph->reuse_stateful_exec >= 0) |
308 | 2.38k | { |
309 | 2.38k | *(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, dynamic_graph->reuse_stateful_exec) = stateful_exec; |
310 | 2.38k | stateful_exec->index = dynamic_graph->reuse_stateful_exec; |
311 | 2.38k | int flag = 0; |
312 | 4.03k | for (i = dynamic_graph->reuse_stateful_exec + 1; !flag && i < dynamic_graph->stateful_execs->rnum; i++) |
313 | 1.64k | if (*(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i) == 0) |
314 | 1.64k | dynamic_graph->reuse_stateful_exec = i, flag = 1; |
315 | 2.38k | if (!flag) // Reset to -1. |
316 | 743 | dynamic_graph->reuse_stateful_exec = -1; |
317 | 2.38k | } else { |
318 | | // Push new, no reuse available. |
319 | 9 | ccv_array_push(dynamic_graph->stateful_execs, &stateful_exec); |
320 | 9 | stateful_exec->index = dynamic_graph->stateful_execs->rnum - 1; |
321 | 9 | } |
322 | 2.39k | } |
323 | 2.40k | } |
324 | 2.40k | } |
325 | 2.41k | } |
326 | | |
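
Taken together, a minimal usage sketch of the public entry point defined in this file. It assumes `model` is an already constructed ccv_cnnp_model_t* and uses a placeholder input shape; compilation happens lazily inside ccv_nnc_dynamic_graph_evaluate on the first call, and again whenever the input parameters change (per the memcmp check above):

ccv_nnc_dynamic_graph_t* const graph = ccv_nnc_dynamic_graph_new();
// One input variable with a placeholder shape, one auto-shaped output variable.
ccv_nnc_tensor_variable_t x = ccv_nnc_tensor_variable_new(graph, CPU_TENSOR_NHWC(32F, 1, 8));
ccv_nnc_tensor_variable_t y = ccv_nnc_tensor_variable_new(graph, ccv_nnc_tensor_auto);
ccv_nnc_tensor_variable_t inputs[1] = { x };
ccv_nnc_tensor_variable_t outputs[1] = { y };
// Forward pass in test mode, no tensor tape, default stream.
ccv_nnc_dynamic_graph_evaluate(graph, model, 1, inputs, 1, outputs, 1, 0, 0);
ccv_nnc_tensor_variable_free(graph, x);
ccv_nnc_tensor_variable_free(graph, y);
ccv_nnc_dynamic_graph_free(graph);
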