/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_cnnp_model.h" |
6 | | #include "_ccv_nnc_graph.h" |
7 | | |
8 | | // MARK - Level-5 API |
9 | | |
10 | | ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size) |
11 | 545 | { |
12 | 545 | if (!model->io) |
13 | 536 | model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
14 | 545 | ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size); |
15 | 545 | model_io->param_ref = 0; |
16 | 545 | model_io->param_sel = 0; |
17 | 545 | model_io->visit = 0; |
18 | 545 | model_io->model = model; |
19 | 545 | model_io->dependencies = 0; |
20 | 545 | model_io->dependents = 0; |
21 | 545 | model_io->outgoings = 0; |
22 | 545 | model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1); |
23 | 545 | ccv_array_push(model->io, &model_io); |
24 | 545 | if (input_size > 0) |
25 | 542 | { |
26 | 542 | model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0); |
27 | 542 | ccv_array_resize(model_io->incomings, input_size); |
28 | 542 | int i; |
29 | 542 | memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size); |
30 | 1.22k | for (i = 0; i < input_size; i++)
31 | 680 | { |
32 | 680 | if (!inputs[i]->outgoings) |
33 | 592 | inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
34 | 680 | ccv_array_push(inputs[i]->outgoings, &model_io); |
35 | 680 | } |
36 | 542 | } else { |
37 | 3 | model_io->incomings = 0; |
38 | 3 | } |
39 | 545 | return model_io; |
40 | 545 | } |
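For orientation, a minimal usage sketch (not part of this file; the encoder and decoder sub-models are hypothetical): ccv_cnnp_model_apply only wires ccv_cnnp_model_io_t handles together, and a composed model is later built from the input and output handles. ccv_cnnp_input, MODEL_IO_LIST and ccv_cnnp_model_new are from the public ccv_cnnp API, though the exact ccv_cnnp_model_new argument list may differ between versions.

ccv_cnnp_model_io_t const x = ccv_cnnp_input();
// Feed the input into a hypothetical encoder, then its output into a hypothetical decoder.
ccv_cnnp_model_io_t const h = ccv_cnnp_model_apply(encoder, MODEL_IO_LIST(x));
ccv_cnnp_model_io_t const y = ccv_cnnp_model_apply(decoder, MODEL_IO_LIST(h));
// The composed model owns the io graph built above.
ccv_cnnp_model_t* const composed = ccv_cnnp_model_new(MODEL_IO_LIST(x), MODEL_IO_LIST(y), 1, "composed");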
41 | | |
42 | | void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size) |
43 | 2 | { |
44 | 2 | assert(dependency_size > 0); |
45 | 2 | if (!model_io->dependencies) |
46 | 2 | model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0); |
47 | 2 | int i, j; |
48 | 5 | for (i = 0; i < dependency_size; i++)
49 | 3 | { |
50 | 3 | int flag = 0; |
51 | | // Check whether it already exists.
52 | 4 | for (j = 0; !flag && j < model_io->dependencies->rnum; j++)
53 | 1 | if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i]) |
54 | 0 | flag = 1; |
55 | 3 | if (flag) |
56 | 0 | continue; |
57 | 3 | ccv_array_push(model_io->dependencies, dependencies + i); |
58 | 3 | ++dependencies[i]->dependents; |
59 | 3 | } |
60 | 2 | } |
61 | | |
62 | | int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model) |
63 | 0 | { |
64 | 0 | return model->output_size; |
65 | 0 | } |
66 | | |
67 | | int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model) |
68 | 16 | { |
69 | | // If the model is compiled, default to 1 unless is_trainable is explicitly set.
70 | 16 | if (model->compiled_data) |
71 | 4 | return model->is_trainable >= 0 ? model->is_trainable : 1;
72 | 12 | return model->is_trainable; |
73 | 16 | } |
74 | | |
75 | | ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index) |
76 | 389 | { |
77 | 389 | if (!model->io) |
78 | 35 | model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
79 | 389 | ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s)); |
80 | 389 | model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;
81 | 389 | model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;
82 | 389 | model_io->visit = 0; |
83 | 389 | model_io->model = model; |
84 | 389 | model_io->outputs = 0; |
85 | 389 | model_io->dependencies = 0; |
86 | 389 | model_io->dependents = 0; |
87 | 389 | model_io->incomings = 0; |
88 | 389 | model_io->outgoings = 0; |
89 | 389 | ccv_array_push(model->io, &model_io); |
90 | 389 | return model_io; |
91 | 389 | } |
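A small sketch of the selector / index convention (not from this file; the model handle is hypothetical). Negative values mean "all parameters", which the code above stores as the ALL_PARAMETERS sentinel, while non-negative values are stored off by one as selector + 1 / index + 1.

// All parameters of the model.
ccv_cnnp_model_io_t const all_params = ccv_cnnp_model_parameters(model, -1, -1);
// Only the parameter at index 0 within selector group 0.
ccv_cnnp_model_io_t const first_param = ccv_cnnp_model_parameters(model, 0, 0);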
92 | | |
93 | | void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context) |
94 | 3 | { |
95 | 3 | model->notify_hook.func = func; |
96 | 3 | model->notify_hook.context = context; |
97 | 3 | } |
98 | | |
99 | | void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload) |
100 | 14 | { |
101 | 14 | if (model->notify_hook.func) |
102 | 3 | model->notify_hook.func(model, tag, payload, model->notify_hook.context); |
103 | 14 | if (model->isa->notify) |
104 | 1 | model->isa->notify(model, tag, payload); |
105 | 14 | } |
106 | | |
107 | | static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size) |
108 | 2.24k | { |
109 | 2.24k | int i, j; |
110 | 4.84k | for (i = 0; i < graph_exec_symbol_size; i++)
111 | 2.60k | { |
112 | 2.60k | ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i; |
113 | | // Check whether this graph exec symbol has any duplicate.
114 | 23.2k | for (j = i + 1; j < graph_exec_symbol_size;) |
115 | 20.6k | { |
116 | 20.6k | ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j; |
117 | | // If there is an identical graph exec symbol, remove it.
118 | 20.6k | if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)
119 | 2.70k | { |
120 | 2.70k | if (j + 1 < graph_exec_symbol_size) |
121 | 436 | *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1]; |
122 | 2.70k | --graph_exec_symbol_size; |
123 | 2.70k | continue; |
124 | 2.70k | } |
125 | 17.9k | ++j; |
126 | 17.9k | } |
127 | 2.60k | } |
128 | 2.24k | return graph_exec_symbol_size; |
129 | 2.24k | } |
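The dedup above uses swap-with-last compaction: a duplicate is overwritten by the current last element and the logical size shrinks, so nothing is shifted (element order is not preserved). A standalone sketch of the same idea on plain ints, for illustration only:

static int dedup_ints(int* const values, int size)
{
	int i, j;
	for (i = 0; i < size; i++)
		for (j = i + 1; j < size;)
		{
			if (values[j] == values[i])
			{
				// Overwrite the duplicate with the last element and shrink the logical size.
				values[j] = values[size - 1];
				--size;
				continue;
			}
			++j;
		}
	return size; // The new logical size.
}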
130 | | |
131 | | void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable) |
132 | 3.14k | { |
133 | 3.14k | ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context; |
134 | 3.14k | ccv_cnnp_model_t* const model = add_to_array_context->sequence->model; |
135 | 3.14k | int i; |
136 | 3.14k | if (add_to_array_context->add_parameter_indices && !model->parameter_indices)
137 | 2.52k | model->parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
138 | 37.1k | for (i = 0; i < add_to_array_context->symbols->rnum; i++)
139 | 33.9k | { |
140 | 33.9k | const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i); |
141 | 33.9k | if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)
142 | 24 | { |
143 | | // Only add to parameter_indices if it is trainable. |
144 | 24 | if (add_to_array_context->add_parameter_indices) |
145 | 15 | ccv_array_add_unique_int(model->parameter_indices, i); |
146 | | // Found it, return, don't add it. |
147 | 24 | return; |
148 | 24 | } |
149 | 33.9k | } |
150 | | // Only add to parameter_indices if it is trainable. |
151 | 3.12k | if (add_to_array_context->add_parameter_indices) |
152 | 2.94k | ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum); |
153 | | // This is a new one, no need to add_unique_int, it is unique. |
154 | 3.12k | ccv_array_push(add_to_array_context->symbols, &symbol); |
155 | 3.12k | if (add_to_array_context->trainables) |
156 | 2.96k | ccv_array_push(add_to_array_context->trainables, &is_trainable); |
157 | 3.12k | char id[2048]; |
158 | 3.12k | id[0] = add_to_array_context->prefix; |
159 | 3.12k | id[1] = '-'; |
160 | 3.12k | int total_len = 2; |
161 | 6.47k | for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)
162 | 3.34k | { |
163 | 3.34k | const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i); |
164 | 3.34k | int len; |
165 | 3.34k | if (name->name && name->name[0] != '\0')
166 | 345 | len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence); |
167 | 3.00k | else |
168 | 3.00k | len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence); |
169 | 3.34k | total_len += len; |
170 | 3.34k | if (total_len >= 2047) |
171 | 0 | break; |
172 | 3.34k | } |
173 | 3.12k | if (total_len < 2047) |
174 | 3.12k | total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it); |
175 | 3.12k | assert(total_len < 2048); |
176 | 3.12k | char *heap_id = (char*)ccmalloc(total_len + 1); |
177 | 3.12k | memcpy(heap_id, id, total_len + 1); |
178 | 3.12k | ccv_array_push(add_to_array_context->ids, &heap_id); |
179 | 3.12k | ++add_to_array_context->sequence->it; |
180 | 3.12k | } |
181 | | |
182 | | static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints) |
183 | 2.29k | { |
184 | 2.29k | compiled_data->f = compiled_data->fits + output_size; |
185 | 2.29k | compiled_data->xpu_alloc.mp_hdr = -1; |
186 | 2.29k | compiled_data->xpu_alloc.freed = kh_init(dy_str); |
187 | 2.29k | compiled_data->xpu_alloc.allocd = kh_init(dy_alloc); |
188 | 2.29k | compiled_data->gradient_checkpoints = gradient_checkpoints; |
189 | 2.29k | } |
190 | | |
191 | | static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss) |
192 | 2.29k | { |
193 | 2.29k | assert(model->graph); |
194 | 2.29k | model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size); |
195 | 2.29k | int i; |
196 | 4.65k | for (i = 0; i < input_size; i++)
197 | 2.35k | model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0); |
198 | 2.29k | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
199 | 2.29k | ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0); |
200 | 2.29k | ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0); |
201 | 2.29k | ccv_cnnp_model_sequence_t model_sequence = { |
202 | 2.29k | .bank = kh_init(ccv_cnnp_model_name_bank) |
203 | 2.29k | }; |
204 | 2.29k | ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = { |
205 | 2.29k | .add_parameter_indices = 1, |
206 | 2.29k | .prefix = 't', |
207 | 2.29k | .sequence = &model_sequence, |
208 | 2.29k | .symbols = parameters, |
209 | 2.29k | .ids = parameter_ids, |
210 | 2.29k | .trainables = parameter_trainables, |
211 | 2.29k | }; |
212 | 2.29k | ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
213 | 2.29k | ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0); |
214 | 2.29k | ccv_cnnp_model_add_to_array_context_t add_to_output_context = { |
215 | 2.29k | .add_parameter_indices = 0, |
216 | 2.29k | .prefix = 'r', |
217 | 2.29k | .sequence = &model_sequence, |
218 | 2.29k | .symbols = internals, |
219 | 2.29k | .ids = internal_ids, |
220 | 2.29k | .trainables = 0, |
221 | 2.29k | }; |
222 | 2.29k | ccv_cnnp_model_build_data_t build_data = { |
223 | 2.29k | .is_trainable = model->is_trainable >= 0 ? model->is_trainable : 1,
224 | 2.29k | .model_sequence = &model_sequence, |
225 | 2.29k | .add_to_array = ccv_cnnp_model_add_to_array, |
226 | 2.29k | .parameters = parameters, |
227 | 2.29k | .context = { |
228 | 2.29k | .add_to_parameter = &add_to_parameter_context, |
229 | 2.29k | .add_to_output = &add_to_output_context, |
230 | 2.29k | }, |
231 | 2.29k | .gradient_checkpoints = 0, |
232 | 2.29k | }; |
233 | 2.29k | model->data = &build_data; |
234 | 2.29k | ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0); |
235 | 4.59k | for (i = 0; i < model->output_size; i++)
236 | 2.30k | { |
237 | 2.30k | const ccv_nnc_tensor_symbol_t output = model->outputs[i]; |
238 | 2.30k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output); |
239 | 2.30k | if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL) |
240 | 1.30k | continue; |
241 | | // If the output is an alias, insert a data transform regardless, for result correctness (we cannot bind an alias). You can check the ccv_nnc_tensor_bind_symbol method
242 | | // to see that we can correctly bind a tensor that has aliases derived from it, but we cannot correctly bind an alias tensor (this is expected, because
243 | | // we cannot handle the case where an alias covers only part of the original tensor yet is bound differently).
244 | 1.00k | const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output); |
245 | 1.00k | model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0); |
246 | 1.00k | ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous"); |
247 | 1.00k | ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT); |
248 | 1.00k | } |
249 | 2.29k | model->data = 0; |
250 | 2.29k | kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank); |
251 | 2.29k | if (model_sequence.sequences) |
252 | 2.27k | ccv_array_free(model_sequence.sequences); |
253 | | // Check whether there are parameters that are not trainable. If there are, we will allocate a uint64 bitmap to record that.
254 | 2.29k | int not_trainables = 0; |
255 | | // Assert that no parameter is an alias.
256 | 5.24k | for (i = 0; i < parameters->rnum; i++)
257 | 2.94k | { |
258 | 2.94k | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i); |
259 | 2.94k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter); |
260 | 2.94k | assert(alias_to.graph == 0); // Cannot find the one alias to. |
261 | 2.94k | if (*(int*)ccv_array_get(parameter_trainables, i) == 0) |
262 | 11 | not_trainables = 1; |
263 | 2.94k | } |
264 | 2.29k | assert(parameters->rnum == parameter_trainables->rnum); |
265 | 2.29k | uint64_t* parameter_flags = 0; |
266 | 2.29k | if (not_trainables) |
267 | 8 | { |
268 | 8 | parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t)); |
269 | 39 | for (i = 0; i < parameter_trainables->rnum; i++)
270 | 31 | if (*(int*)ccv_array_get(parameter_trainables, i)) |
271 | 20 | parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63)); |
272 | 8 | } |
273 | 2.29k | ccv_array_free(parameter_trainables); |
274 | | // Assert that no internal is an alias.
275 | 2.45k | for (i = 0; i < internals->rnum; i++)
276 | 161 | { |
277 | 161 | const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i); |
278 | 161 | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal); |
279 | 161 | assert(alias_to.graph == 0); // Cannot find the one alias to. |
280 | 161 | } |
281 | 2.29k | const int output_size = model->output_size; |
282 | 2.29k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
283 | 2.29k | const int parameters_rnum = parameters->rnum; |
284 | 2.29k | if (input_size > 0) |
285 | 2.29k | { |
286 | 2.29k | ccv_array_resize(parameters, parameters_rnum + input_size); |
287 | 2.29k | memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t)); |
288 | 2.29k | } |
289 | 2.29k | ccv_nnc_symbolic_graph_simplify(model->graph, |
290 | 2.29k | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION, |
291 | 2.29k | CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, |
292 | 2.29k | CCV_NNC_SIMPLIFY_OPS_FUSION, |
293 | 2.29k | CCV_NNC_SIMPLIFY_GRAPH_PRUNING), |
294 | 2.29k | ccv_array_get(parameters, 0), parameters_rnum + input_size, |
295 | 2.29k | model->outputs, output_size, |
296 | 2.29k | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
297 | 2.29k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
298 | | // Size it down. |
299 | 2.29k | parameters->rnum = parameters_rnum; |
300 | 2.29k | ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1)); |
301 | 2.29k | _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints); |
302 | 2.29k | const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph); |
303 | 2.29k | assert(evaluate_to_size > 0); |
304 | 2.29k | compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size); |
305 | 2.29k | memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size); |
306 | 2.29k | compiled_data->loss = loss; |
307 | 2.29k | if (loss.cmd == CCV_NNC_NOOP) |
308 | 2.28k | { |
309 | | // If no loss function is provided, there are no fits.
310 | 4.57k | for (i = 0; i < output_size; i++)
311 | 2.29k | { |
312 | 2.29k | compiled_data->fits[i] = NO_TENSOR_SYMBOL; |
313 | 2.29k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]); |
314 | 2.29k | if (alias_to.d < 0) |
315 | 2.29k | compiled_data->f[i] = model->outputs[i]; |
316 | 0 | else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original. |
317 | 0 | int ofs[CCV_NNC_MAX_DIM_ALLOC]; |
318 | 0 | int inc[CCV_NNC_MAX_DIM_ALLOC]; |
319 | 0 | ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc); |
320 | 0 | int j; |
321 | 0 | for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++) |
322 | 0 | { assert(ofs[j] == 0); } // There is no ofs. |
323 | 0 | compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet. |
324 | 0 | } |
325 | 2.29k | } |
326 | 2.28k | } else { |
327 | 20 | for (i = 0; i < output_size; i++)
328 | 10 | { |
329 | 10 | const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]); |
330 | 10 | const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0); |
331 | 10 | compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0); |
332 | 10 | ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0); |
333 | 10 | } |
334 | 10 | } |
335 | 2.29k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
336 | 2.29k | ccv_nnc_symbolic_graph_simplify(model->graph, |
337 | 2.29k | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function. |
338 | 2.29k | 0, 0, // No need to provide binds at this point. |
339 | 2.29k | compiled_data->f, model->output_size, |
340 | 2.29k | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
341 | 2.29k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
342 | | // If inputs are from GPU, stream type is GPU. |
343 | 2.29k | compiled_data->parameters = parameters; |
344 | 2.29k | compiled_data->parameter_flags = parameter_flags; |
345 | 2.29k | compiled_data->internals = internals; |
346 | 2.29k | compiled_data->ids.parameters = parameter_ids; |
347 | 2.29k | compiled_data->ids.internals = internal_ids; |
348 | 2.29k | ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph); |
349 | 2.29k | } |
350 | | |
351 | | static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) |
352 | 8.82k | { |
353 | 8.82k | ccv_array_t* const stack = (ccv_array_t*)context; |
354 | 8.82k | ccv_array_push(stack, &symbol.d); |
355 | 8.82k | } |
356 | | |
357 | | static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index) |
358 | 38.5k | { |
359 | 38.5k | const ccv_nnc_tensor_symbol_t src_symbol = { |
360 | 38.5k | .d = src_index, |
361 | 38.5k | .graph = src_graph |
362 | 38.5k | }; |
363 | 38.5k | const ccv_nnc_tensor_symbol_t dest_symbol = { |
364 | 38.5k | .d = dest_index, |
365 | 38.5k | .graph = dest_graph |
366 | 38.5k | }; |
367 | 38.5k | const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol); |
368 | 38.5k | ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params); |
369 | 38.5k | int ofs[CCV_NNC_MAX_DIM_ALLOC]; |
370 | 38.5k | int inc[CCV_NNC_MAX_DIM_ALLOC]; |
371 | 38.5k | if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc)) |
372 | 2.00k | ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc); |
373 | 38.5k | } |
374 | | |
375 | | static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index) |
376 | 2.41k | { |
377 | 2.41k | const ccv_nnc_tensor_symbol_t src_symbol = { |
378 | 2.41k | .d = src_index, |
379 | 2.41k | .graph = src_graph |
380 | 2.41k | }; |
381 | 2.41k | const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol); |
382 | 2.41k | const ccv_nnc_tensor_symbol_t dest_symbol = { |
383 | 2.41k | .d = dest_index, |
384 | 2.41k | .graph = dest_graph |
385 | 2.41k | }; |
386 | 2.41k | const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol); |
387 | 2.41k | return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0; |
388 | 2.41k | } |
389 | | |
390 | | static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size); |
391 | | static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data); |
392 | | |
393 | | typedef struct { |
394 | | int parallel_count; |
395 | | ccv_nnc_symbolic_graph_t* graph; |
396 | | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
397 | | } ccv_nnc_graph_exec_update_t; |
398 | | |
399 | | static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint) |
400 | 58 | { |
401 | 58 | ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context; |
402 | 58 | ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena; |
403 | 58 | ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol); |
404 | 58 | ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd); |
405 | 58 | ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint); |
406 | 58 | const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph; |
407 | 58 | const int parallel_count = graph_exec_update->parallel_count; |
408 | 58 | int i; |
409 | 178 | for (i = 1; i < parallel_count; i++)
410 | 120 | { |
411 | 120 | const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i)); |
412 | 120 | if (!CCV_NO_GRAPH_EXEC(copy)) |
413 | 120 | { |
414 | 120 | ccv_nnc_graph_exec_set(copy.graph, copy, cmd); |
415 | 120 | ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint); |
416 | 120 | } |
417 | 120 | } |
418 | 58 | } |
419 | | |
420 | | void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size) |
421 | 2.20k | { |
422 | 2.20k | assert(model->graph); |
423 | 2.20k | assert(model->compiled_data); |
424 | 2.20k | assert(!init->graph); |
425 | 2.20k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
426 | 2.20k | init->graph = ccv_nnc_symbolic_graph_new(); |
427 | 2.20k | ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0); |
428 | 2.20k | ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0); |
429 | 2.20k | _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss); |
430 | 2.20k | init->parallel_count = model->parallel_count; |
431 | 2.20k | init->memory_compression = model->memory_compression; |
432 | 2.20k | init->memory_reduction = model->memory_reduction; |
433 | 2.20k | init->gradient_checkpointing = model->gradient_checkpointing; |
434 | 2.20k | init->compiled_data->stream_type = model->compiled_data->stream_type; |
435 | 2.20k | init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer; |
436 | 2.20k | init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size; |
437 | 2.20k | if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
438 | 2.20k | _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0); |
439 | 2.20k | ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0); |
440 | 2.20k | ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL); |
441 | 2.20k | int i, j; |
442 | | // Verify that parameters, internals and saved_aux in both graphs have the same dimensionality.
443 | 4.61k | for (i = 0; i < compiled_data->parameters->rnum; i++)
444 | 2.41k | { |
445 | 2.41k | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
446 | 2.41k | assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d)); |
447 | 2.41k | } |
448 | 2.20k | for (i = 0; i < compiled_data->internals->rnum; i++)
449 | 0 | { |
450 | 0 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d; |
451 | 0 | assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d)); |
452 | 0 | } |
453 | | // Update inputs. |
454 | 2.20k | assert(model->input_size == init->input_size); |
455 | 4.40k | for (i = 0; i < model->input_size; i++)
456 | 2.20k | if (model->inputs[i].d >= 0) |
457 | 2.20k | { |
458 | 2.20k | assert(init->inputs[i].d >= 0); |
459 | 2.20k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d); |
460 | 2.20k | } |
461 | | // Update outputs. |
462 | 2.20k | assert(model->output_size == init->output_size); |
463 | 4.40k | for (i = 0; i < model->output_size; i++)
464 | 2.20k | { |
465 | 2.20k | if (model->outputs[i].d >= 0) |
466 | 2.20k | { |
467 | 2.20k | assert(init->outputs[i].d >= 0); |
468 | 2.20k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d); |
469 | 2.20k | } |
470 | 2.20k | if (model->outputs[i].d != model->compiled_data->f[i].d) |
471 | 0 | { |
472 | 0 | assert(init->outputs[i].d != init->compiled_data->f[i].d); |
473 | 0 | if (model->compiled_data->f[i].d >= 0) |
474 | 0 | { |
475 | 0 | assert(init->compiled_data->f[i].d >= 0); |
476 | 0 | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d); |
477 | 0 | } |
478 | 0 | } |
479 | 2.20k | } |
480 | | // Go through the graph to set tensor on matching symbols |
481 | 11.0k | for (i = 0; i < stack->rnum; i++)
482 | 8.82k | { |
483 | 8.82k | const int d = *(int*)ccv_array_get(stack, i); |
484 | | // If it exceeds the range, skip.
485 | 8.82k | if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) || |
486 | 8.82k | d >= ccv_nnc_graph_exec_symbol_count(model->graph)) |
487 | 0 | continue; |
488 | 8.82k | const ccv_nnc_graph_exec_symbol_t src_symbol = { |
489 | 8.82k | .d = d, |
490 | 8.82k | .graph = init->graph |
491 | 8.82k | }; |
492 | 8.82k | const ccv_nnc_graph_exec_symbol_t dest_symbol = { |
493 | 8.82k | .d = d, |
494 | 8.82k | .graph = model->graph |
495 | 8.82k | }; |
496 | 8.82k | const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol); |
497 | 8.82k | const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol); |
498 | | // If the command doesn't match, skip.
499 | 8.82k | if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)
500 | 0 | continue; |
501 | | // Now get all the inputs and outputs, if matches, set them. |
502 | 8.82k | const int* src_inputs; |
503 | 8.82k | int src_input_size; |
504 | 8.82k | const int* src_outputs; |
505 | 8.82k | int src_output_size; |
506 | 8.82k | ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size); |
507 | 8.82k | const int* dest_inputs; |
508 | 8.82k | int dest_input_size; |
509 | 8.82k | const int* dest_outputs; |
510 | 8.82k | int dest_output_size; |
511 | 8.82k | ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size); |
512 | | // We may have mismatched input / output sizes because this is the minimizer and it has
513 | | // different saved_aux (for example, when it was shrunk with CMD_NOOP).
514 | 8.82k | if (src_input_size != dest_input_size) |
515 | 0 | continue; |
516 | 8.82k | if (src_output_size != dest_output_size) |
517 | 0 | continue; |
518 | 8.82k | ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd); |
519 | | // There may be mismatches between the source tensor symbols and the destination tensor symbols. The reason is that
520 | | // the minimizer may be passed in later, therefore we may allocate tensors for the minimizer later in the original
521 | | // graph, whereas in the newly created graph it is streamlined (the minimizer exists from the beginning). That
522 | | // makes the order of tensor symbol creation different, so which tensor is which can no longer be inferred from the
523 | | // index alone. However, setting a new minimizer won't change the exec symbol ordering, because we never create new exec
524 | | // symbols after the gradient init step. Setting a new minimizer just updates that exec symbol's settings; it does not
525 | | // create a new exec symbol.
526 | 33.7k | for (j = 0; j < src_input_size; j++)
527 | 24.8k | if (src_inputs[j] >= 0) |
528 | 20.4k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]); |
529 | 22.4k | for (j = 0; j < src_output_size; j++)
530 | 13.6k | if (src_outputs[j] >= 0) |
531 | 13.6k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]); |
532 | 8.82k | } |
533 | 2.20k | ccv_array_free(stack); |
534 | | // After this, we get all tensors in the model graph resolved through tensor_auto. |
535 | 2.20k | ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL); |
536 | | // Verify that the symbols we get match.
537 | 2.20k | const int parameter_size = compiled_data->parameters->rnum; |
538 | 4.61k | for (i = 0; i < parameter_size; i++)
539 | 2.41k | { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); } |
540 | 2.20k | const int internal_size = compiled_data->internals->rnum; |
541 | 2.20k | for (i = 0; i < internal_size; i++)
542 | 0 | { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); } |
543 | | // Go through compiled data. |
544 | 2.20k | if (compiled_data->tensor_arena) |
545 | 2.20k | { |
546 | 2.20k | const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph); |
547 | 2.20k | if (flag == 0 && compiled_data->graph_exec_arena) |
548 | 2.20k | { |
549 | 2.20k | ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph); |
550 | | // Since we will reinit, if we previously set is_test, we need to set it again. |
551 | 2.20k | if (compiled_data->is_test) |
552 | 1 | { |
553 | 1 | const int parallel_count = ccv_max(model->parallel_count, 1); |
554 | 1 | ccv_nnc_graph_exec_update_t update = { |
555 | 1 | .parallel_count = parallel_count, |
556 | 1 | .graph = model->graph, |
557 | 1 | .graph_exec_arena = compiled_data->graph_exec_arena, |
558 | 1 | }; |
559 | 1 | ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update); |
560 | 1 | } |
561 | 2.20k | } else |
562 | | // Free-up tensor arena & graph exec arena. |
563 | 0 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
564 | 2.20k | } |
565 | | // There are other compiled graphs, for accumulating and applying gradients.
566 | | // However, the main conclusion is that these absorb operations shouldn't impact parameters.
567 | | // Thus, they won't impact the shape of gradients (only outgrad). Since we don't allocate
568 | | // outgrad ourselves, it is not a concern. For normal gradients, the shape cannot be
569 | | // changed, otherwise the parameters' shape would be meaningless. The same goes for internals.
570 | | // That is why we don't update those compiled graphs at all at this point.
571 | | // Free the model, we've already "absorbed" it. |
572 | 2.20k | ccv_cnnp_model_free(init); |
573 | 2.20k | } |
574 | | |
575 | | void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss) |
576 | 2.28k | { |
577 | 2.28k | assert(input_size == model->input_size || model->input_size == 0); |
578 | 2.28k | if (model->input_size == 0) |
579 | 6 | model->input_size = input_size; |
580 | 2.28k | if (!model->graph) // The graph is not compiled yet. |
581 | 87 | { |
582 | 87 | model->graph = ccv_nnc_symbolic_graph_new(); |
583 | 87 | _ccv_cnnp_model_compile(model, inputs, input_size, loss); |
584 | 87 | assert(model->compiled_data); |
585 | 87 | int i, flag = 0; |
586 | 217 | for (i = 0; !flag && i < input_size; i++)
587 | 130 | flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY); |
588 | | // If inputs are from GPU, stream type is GPU. |
589 | 87 | model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
590 | 87 | model->compiled_data->minimize.minimizer = minimizer; |
591 | 87 | model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer); |
592 | 2.20k | } else { |
593 | | // Now, finally, fill in this part. If the graph is already compiled, we make a copy of the model,
594 | | // and then absorb the "new model" into the old one.
595 | 2.20k | ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable); |
596 | 2.20k | ccv_cnnp_model_absorb(model, init, inputs, input_size); |
597 | | // Reset minimizer. |
598 | 2.20k | ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0); |
599 | 2.20k | } |
600 | 2.28k | } |
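A minimal compile sketch (not from this file; the model handle is hypothetical, and CPU_TENSOR_NHWC, TENSOR_PARAM_LIST and CMD_NOOP are assumed to be the usual convenience macros from the ccv_nnc headers). Passing a NOOP loss takes the loss.cmd == CCV_NNC_NOOP path above, so compiled_data->f simply refers to the model outputs.

const ccv_nnc_tensor_param_t input_params = CPU_TENSOR_NHWC(32F, 8, 3);
// No minimizer and no loss attached yet; both can be set or replaced later.
ccv_cnnp_model_compile(model, TENSOR_PARAM_LIST(input_params), CMD_NOOP(), CMD_NOOP());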
601 | | |
602 | | ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable) |
603 | 2.20k | { |
604 | 2.20k | ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0); |
605 | 2.20k | new_model->is_trainable = is_trainable; |
606 | 2.20k | return new_model; |
607 | 2.20k | } |
608 | | |
609 | | void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
610 | 4.44k | { |
611 | 4.44k | assert(model->graph); |
612 | 4.44k | assert(output_size == model->output_size); |
613 | 4.44k | ccv_nnc_symbolic_graph_t* const graph = model->graph; |
614 | 4.44k | ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL); |
615 | 4.44k | int i; |
616 | 8.89k | for (i = 0; i < output_size; i++)
617 | 4.44k | { |
618 | 4.44k | assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL); |
619 | 4.44k | outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]); |
620 | 4.44k | } |
621 | 4.44k | } |
622 | | |
623 | | void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size) |
624 | 3 | { |
625 | 3 | if (workspace_size == model->workspace_size) |
626 | 0 | return; |
627 | 3 | model->workspace_size = workspace_size; |
628 | 3 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
629 | 3 | if (compiled_data && compiled_data->graph) |
630 | 0 | ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL); |
631 | 3 | } |
632 | | |
633 | | size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model) |
634 | 0 | { |
635 | 0 | return model->workspace_size; |
636 | 0 | } |
637 | | |
638 | | void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel) |
639 | 15 | { |
640 | 15 | if (parallel == 0) |
641 | 0 | model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
642 | 15 | else |
643 | 15 | model->parallel_count = parallel; |
644 | 15 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
645 | 15 | if (compiled_data) |
646 | 11 | { assert(!compiled_data->graph); } |
647 | 15 | } |
648 | | |
649 | | void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count) |
650 | 0 | { |
651 | 0 | model->max_stream_count = max_stream_count; |
652 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
653 | 0 | if (compiled_data) |
654 | 0 | { assert(!compiled_data->graph); } |
655 | 0 | } |
656 | | |
657 | | void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression) |
658 | 0 | { |
659 | 0 | model->memory_compression = memory_compression; |
660 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
661 | 0 | if (compiled_data) |
662 | 0 | { assert(!compiled_data->graph); } |
663 | 0 | } |
664 | | |
665 | | void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction) |
666 | 0 | { |
667 | 0 | model->memory_reduction = memory_reduction; |
668 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
669 | 0 | if (compiled_data) |
670 | 0 | { assert(!compiled_data->graph); } |
671 | 0 | } |
672 | | |
673 | | void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing) |
674 | 2 | { |
675 | 2 | model->gradient_checkpointing = gradient_checkpointing; |
676 | 2 | } |
677 | | |
678 | | int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model) |
679 | 0 | { |
680 | 0 | return model->gradient_checkpointing; |
681 | 0 | } |
682 | | |
683 | | typedef struct { |
684 | | int parallel_count; |
685 | | ccv_nnc_symbolic_graph_t* graph; |
686 | | ccv_cnnp_compiled_data_t* compiled_data; |
687 | | ccv_nnc_tensor_arena_t* tensor_arena; |
688 | | } ccv_nnc_tensor_init_states_t; |
689 | | |
690 | | static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data) |
691 | 89 | { |
692 | 89 | int i; |
693 | 89 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
694 | 165 | for (i = 0; i < compiled_data->parameters->rnum; i++)
695 | 112 | { |
696 | 112 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
697 | 112 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
698 | 36 | return 1; |
699 | 112 | } |
700 | 53 | for (i = 0; i < compiled_data->internals->rnum; i++)
701 | 5 | { |
702 | 5 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d; |
703 | 5 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
704 | 5 | return 1; |
705 | 5 | } |
706 | 48 | return 0; |
707 | 53 | } |
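The tensors_init.v bookkeeping above is a plain 32-bit bitmap: bit d records whether tensor symbol d has been initialized. An illustrative sketch of the test / set convention (not part of this file; assumes <stdint.h>):

// One bit per tensor symbol index d, packed into 32-bit words.
static inline int bitmap_is_set(const uint32_t* const v, const int d)
{
	return (v[d >> 5] & (1u << (d & 0x1f))) != 0;
}

static inline void bitmap_set(uint32_t* const v, const int d)
{
	v[d >> 5] |= (1u << (d & 0x1f));
}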
708 | | |
709 | | static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol) |
710 | 329 | { |
711 | 329 | ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context; |
712 | 329 | ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena; |
713 | 329 | ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol); |
714 | 329 | if (!output_tensor) |
715 | 0 | return; |
716 | 329 | const int d = output_symbol.d; |
717 | 329 | assert(d < tensor_init_states->compiled_data->tensors_init.size); |
718 | 329 | uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v); |
719 | 329 | if (init_v[d >> 5] & (1u << (d & 0x1f))) |
720 | 29 | return; |
721 | 300 | init_v[d >> 5] |= (1u << (d & 0x1f)); |
722 | 300 | ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);
723 | 300 | const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph; |
724 | 300 | const int parallel_count = tensor_init_states->parallel_count; |
725 | 300 | int i; |
726 | 780 | for (i = 1; i < parallel_count; i++)
727 | 480 | { |
728 | 480 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i)); |
729 | 480 | if (copy) |
730 | 480 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, ©, 1, 0); |
731 | 480 | } |
732 | 300 | } |
733 | | |
734 | | // This method can only handle cases where we added new tensors and execs, never deleted any. This invariant holds because
735 | | // we set up everything (including calling the simplify method) in the ccv_cnnp_model_compile method, before this rewind setup.
736 | | static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model) |
737 | 2 | { |
738 | 2 | assert(model->graph); |
739 | 2 | assert(model->compiled_data); |
740 | 2 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
741 | 2 | assert(compiled_data->rewindables); |
742 | 2 | int i; |
743 | 51 | for (i = 0; i < compiled_data->rewindables->rnum; i++)
744 | 49 | { |
745 | 49 | const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i); |
746 | 49 | if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC) |
747 | 16 | ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec); |
748 | 33 | else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR) |
749 | 33 | ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor); |
750 | 49 | } |
751 | 2 | ccv_array_clear(compiled_data->rewindables); |
752 | 2 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
753 | 2 | } |
754 | | |
755 | | static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name) |
756 | 6.09k | { |
757 | 6.09k | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
758 | 6.09k | .type = CCV_CNNP_REWIND_TENSOR, |
759 | 6.09k | .tensor = symbol |
760 | 6.09k | }; |
761 | 6.09k | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
762 | 6.09k | ccv_array_push(rewind_symbols, &rewind_symbol); |
763 | 6.09k | } |
764 | | |
765 | | static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name) |
766 | 475 | { |
767 | 475 | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
768 | 475 | .type = CCV_CNNP_REWIND_TENSOR, |
769 | 475 | .tensor = symbol |
770 | 475 | }; |
771 | 475 | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
772 | 475 | ccv_array_push(rewind_symbols, &rewind_symbol); |
773 | 475 | } |
774 | | |
775 | | static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) |
776 | 2.32k | { |
777 | 2.32k | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
778 | 2.32k | .type = CCV_CNNP_REWIND_GRAPH_EXEC, |
779 | 2.32k | .graph_exec = symbol |
780 | 2.32k | }; |
781 | 2.32k | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
782 | 2.32k | ccv_array_push(rewind_symbols, &rewind_symbol); |
783 | 2.32k | } |
784 | | |
785 | | static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph) |
786 | 35.0k | { |
787 | 35.0k | ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol); |
788 | 35.0k | if (!CCV_NO_GRAPH_EXEC(update_exec)) |
789 | 19.9k | ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd); |
790 | 35.0k | int i; |
791 | 49.9k | for (i = 1; i < parallel_count; i++)
792 | 14.8k | { |
793 | 14.8k | ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i); |
794 | 14.8k | const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol); |
795 | 14.8k | if (!CCV_NO_GRAPH_EXEC(copy)) |
796 | 14.6k | ccv_nnc_graph_exec_set(copy.graph, copy, cmd); |
797 | 14.8k | } |
798 | 35.0k | } |
799 | | |
800 | | static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd) |
801 | 20.0k | { |
802 | 20.0k | assert(compiled_data); |
803 | 20.0k | assert(symbolic_graph); |
804 | 20.0k | ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd); |
805 | 20.0k | int i; |
806 | 35.0k | for (i = 1; i < parallel_count; i++)
807 | 14.9k | { |
808 | 14.9k | ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i); |
809 | 14.9k | if (copy_symbol.graph) |
810 | 14.8k | ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd); |
811 | 14.9k | } |
812 | 20.0k | ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena; |
813 | 20.0k | if (graph_exec_arena) |
814 | 20.0k | _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph); |
815 | | // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph) |
816 | 20.0k | ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena; |
817 | 20.0k | if (gradient_graph_exec_arena) |
818 | 15.0k | _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph); |
819 | 20.0k | } |
820 | | |
821 | | static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice) |
822 | 20.0k | { |
823 | 20.0k | int this_parameter_flag = 0; |
824 | 20.0k | if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL) |
825 | 0 | return this_parameter_flag; |
826 | 20.0k | const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]); |
827 | 20.0k | int j, k; |
828 | | // For no-op, we can preserve previous saved_aux_size. |
829 | 20.0k | if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)
830 | 67 | { |
831 | | // If the old minimizer is a noop, then old_saved_aux_size should be whatever its previous
832 | | // saved_aux_size was; otherwise we would reinit the saved_aux repeatedly when switching between
833 | | // noop and a real minimizer. We don't want that, because high-level frameworks use that switch to
834 | | // make sure some model parameters don't update when we don't want them to.
835 | 67 | int old_saved_aux_size; |
836 | 67 | if (old_minimizer.cmd == CCV_NNC_NOOP) |
837 | 67 | { |
838 | 67 | int input_size; |
839 | 67 | ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0); |
840 | 67 | if (input_size < 2) // This is not legit. |
841 | 0 | old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer); |
842 | 67 | else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters. |
843 | 67 | old_saved_aux_size = input_size - 2; |
844 | 67 | } else |
845 | 0 | old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer); |
846 | 67 | if (old_saved_aux_size != saved_aux_size) |
847 | 65 | { |
848 | 65 | this_parameter_flag = 1; |
849 | 65 | if (saved_aux_size > old_saved_aux_size) |
850 | 65 | { |
851 | | // Allocate new tensor symbols. |
852 | 65 | const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]); |
853 | 189 | for (j = old_saved_aux_size; j < saved_aux_size; j++)
854 | 124 | { |
855 | 124 | saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0); |
856 | 124 | saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0); |
857 | 124 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
858 | 460 | for (k = 1; k < parallel_count; k++)
859 | 336 | { |
860 | 336 | ccv_nnc_tensor_param_t dev_info = info; |
861 | 336 | if (k != device_id) |
862 | 336 | CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k); |
863 | 0 | else |
864 | 0 | CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0); |
865 | 336 | const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0); |
866 | 336 | const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0); |
867 | 336 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy); |
868 | 336 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy); |
869 | 336 | } |
870 | 124 | } |
871 | 65 | } else { |
872 | 0 | for (j = saved_aux_size; j < old_saved_aux_size; j++) |
873 | 0 | { |
874 | 0 | for (k = 1; k < parallel_count; k++) |
875 | 0 | { |
876 | 0 | const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k); |
877 | 0 | if (src_copy.d >= 0) |
878 | 0 | { |
879 | 0 | ccv_nnc_tensor_symbol_free(graph, src_copy); |
880 | 0 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL); |
881 | 0 | } |
882 | 0 | const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k); |
883 | 0 | if (dest_copy.d >= 0) |
884 | 0 | { |
885 | 0 | ccv_nnc_tensor_symbol_free(graph, dest_copy); |
886 | 0 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL); |
887 | 0 | } |
888 | 0 | } |
889 | 0 | ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source); |
890 | 0 | ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination); |
891 | 0 | saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL; |
892 | 0 | } |
893 | 0 | } |
894 | 65 | } |
895 | 67 | } |
896 | 20.0k | _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer); |
897 | 20.0k | if (this_parameter_flag) |
898 | 65 | { |
899 | 65 | ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2]; |
900 | 65 | ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1]; |
901 | 65 | const int* inputs = 0; |
902 | 65 | int input_size = 0; |
903 | 65 | ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0); |
904 | 65 | assert(input_size >= 1); |
905 | 65 | update_inputs[0].d = inputs[0]; |
906 | 65 | update_inputs[0].graph = graph; |
907 | 65 | update_inputs[1].d = inputs[1]; |
908 | 65 | update_inputs[1].graph = graph; |
909 | 65 | update_outputs[0] = updated_parameters[parameter_indice]; |
910 | 189 | for (j = 0; j < saved_aux_size; j++)
911 | 124 | { |
912 | 124 | update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source; |
913 | 124 | update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination; |
914 | 124 | } |
915 | 65 | ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1); |
916 | 233 | for (k = 1; k < parallel_count; k++)
917 | 168 | { |
918 | 168 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k); |
919 | 168 | assert(copy.d >= 0); |
920 | 168 | ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0); |
921 | 168 | assert(input_size >= 1); |
922 | 168 | update_inputs[0].d = inputs[0]; |
923 | 168 | update_inputs[0].graph = graph; |
924 | 168 | update_inputs[1].d = inputs[1]; |
925 | 168 | update_inputs[1].graph = graph; |
926 | 168 | update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k); |
927 | 504 | for (j = 0; j < saved_aux_size; j++)
928 | 336 | { |
929 | 336 | update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k); |
930 | 336 | update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k); |
931 | 336 | } |
932 | 168 | ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1); |
933 | 168 | } |
934 | 65 | } |
935 | 20.0k | return this_parameter_flag; |
936 | 20.0k | } |
937 | | |
938 | | typedef struct { |
939 | | int parameter_size; |
940 | | ccv_nnc_cmd_t minimizer; |
941 | | ccv_cnnp_model_io_t parameters[1]; |
942 | | } ccv_cnnp_set_minimizer_for_parameter_t; |
943 | | |
944 | | static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model) |
945 | 296 | { |
946 | 296 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
947 | 296 | assert(compiled_data); |
948 | 296 | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
949 | | // We update all parameters, at this point, we have one minimizer. |
950 | 296 | const int parameter_size = compiled_data->parameters->rnum; |
951 | 296 | ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes; |
952 | 296 | ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph; |
953 | 296 | assert(symbolic_graph); |
954 | 296 | const int parallel_count = ccv_max(model->parallel_count, 1); |
955 | 296 | ccv_array_t* const parameters = compiled_data->minimize.parameters; |
956 | 296 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
957 | 296 | int i, j, flag = 0; |
958 | 301 | for (i = 0; i < parameters->rnum; i++)
959 | 5 | { |
960 | 5 | ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i); |
961 | 10 | for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++)
962 | 5 | { |
963 | 5 | const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 1 : set_minimizer_for_parameter->parameters[j]->param_sel;
964 | 5 | assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0); |
965 | 5 | const int old_rnum = parameter_indices->rnum; |
966 | 5 | ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices); |
967 | 5 | const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 10 : set_minimizer_for_parameter->parameters[j]->param_ref; |
968 | 5 | assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0); |
969 | 5 | if (param_ref >= 0) |
970 | 0 | { |
971 | 0 | assert(param_ref + old_rnum < parameter_indices->rnum); |
972 | 0 | *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum); |
973 | 0 | parameter_indices->rnum = old_rnum + 1; |
974 | 0 | } |
975 | 5 | } |
976 | 5 | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer); |
977 | | // We may have duplicated indices, but that is OK; we will just set it twice.
978 | 58 | for (j = 0; j < parameter_indices->rnum; j++53 ) |
979 | 53 | { |
980 | 53 | const int d = *(int*)ccv_array_get(parameter_indices, j); |
981 | 53 | assert(d <= parameter_size); |
982 | 53 | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d)) |
983 | 0 | flag = 1; |
984 | 53 | } |
985 | 5 | ccv_array_clear(parameter_indices); |
986 | 5 | } |
987 | 296 | ccv_array_free(parameter_indices); |
988 | 296 | return flag; |
989 | 296 | } |
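/* Editorial sketch (not part of the covered source): param_sel and param_ref above are
 * encoded selectors -- 0 means "unset", a positive value is a one-based index (hence the
 * "- 1"), and a negative value is kept as an "all / as-is" sentinel. A minimal standalone
 * decoder analogue with hypothetical names, shown only to illustrate the encoding:
 */
#include <assert.h>

static int decode_selector(const int sel)
{
	assert(sel != 0); /* 0 is reserved as the "unset" marker, hence the asserts above */
	return sel > 0 ? sel - 1 : sel; /* > 0: one-based -> zero-based; < 0: keep the sentinel */
}

int main(void)
{
	assert(decode_selector(3) == 2); /* third parameter -> index 2 */
	assert(decode_selector(1) == 0);
	assert(decode_selector(-1) == -1); /* "select all" passes through unchanged */
	return 0;
}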
990 | | |
991 | | static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size) |
992 | 2.24k | { |
993 | 2.24k | if (new_saved_aux_size == old_saved_aux_size) |
994 | 2.24k | return; |
995 | 2.24k | assert(new_saved_aux_size > old_saved_aux_size)7 ; |
996 | 7 | int i, j; |
997 | 72 | for (i = parameter_size - 1; i >= 0; i--65 ) |
998 | 65 | { |
999 | 189 | for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; j--124 ) |
1000 | 124 | saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL; |
1001 | 65 | for (j = old_saved_aux_size - 1; j >= 0; j--0 ) |
1002 | 0 | saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j]; |
1003 | 65 | } |
1004 | 7 | } |
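/* Editorial sketch (not part of the covered source): the scatter above widens an array laid
 * out with row stride old_saved_aux_size into one with row stride new_saved_aux_size, in
 * place, walking rows and columns backwards so no row is overwritten before it is moved.
 * A minimal standalone analogue with plain ints (hypothetical names; -1 stands in for
 * NO_TENSOR_SYMBOL):
 */
#include <assert.h>
#include <stdio.h>

static void scatter_rows(int* const a, const int rows, const int old_stride, const int new_stride)
{
	assert(new_stride >= old_stride);
	int i, j;
	for (i = rows - 1; i >= 0; i--)
	{
		for (j = new_stride - 1; j >= old_stride; j--)
			a[i * new_stride + j] = -1; /* pad the newly added slots */
		for (j = old_stride - 1; j >= 0; j--)
			a[i * new_stride + j] = a[i * old_stride + j]; /* move the old row into place */
	}
}

int main(void)
{
	int aux[6] = { 10, 20, 0, 0, 0, 0 }; /* two rows packed with stride 1, widened to stride 3 */
	scatter_rows(aux, 2, 1, 3);
	int i;
	for (i = 0; i < 6; i++)
		printf("%d ", aux[i]); /* prints: 10 -1 -1 20 -1 -1 */
	printf("\n");
	return 0;
}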
1005 | | |
1006 | | static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model) |
1007 | 41 | { |
1008 | 41 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1009 | 41 | assert(compiled_data); |
1010 | 41 | if (!compiled_data->rewindables) |
1011 | 41 | compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0); |
1012 | 41 | ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0); |
1013 | 41 | ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0); |
1014 | 41 | ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0); |
1015 | 41 | } |
1016 | | |
1017 | | static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size) |
1018 | 2.24k | { |
1019 | 2.24k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1020 | 2.24k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE); |
1021 | 2.24k | assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE); |
1022 | 2.24k | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1023 | 2.24k | assert(evaluate_to_size > 0); |
1024 | 2.24k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1025 | 2.24k | compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count); |
1026 | 2.24k | compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count); |
1027 | 2.24k | int i, j; |
1028 | 2.24k | const int output_size = model->output_size; |
1029 | 2.24k | assert(!fits || fit_size == output_size * parallel_count); |
1030 | 2.24k | if (fits) |
1031 | 12 | for (i = 0; 6 i < output_size; i++6 ) |
1032 | 6 | ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info); |
1033 | 2.24k | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
1034 | 2.24k | const int parameter_size = compiled_data->parameters->rnum; |
1035 | 2.24k | compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size); |
1036 | 2.24k | compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size); |
1037 | 2.24k | compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size); |
1038 | 2.24k | int parameter_size_maybe_more = parameter_size; |
1039 | 2.24k | compiled_data->disable_outgrad = disable_outgrad; |
1040 | 2.24k | int outgrad_size; |
1041 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 02.23k ) |
1042 | 9 | outgrad_size = 0; |
1043 | 2.23k | else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs. |
1044 | 2.22k | outgrad_size = model->input_size; |
1045 | 3 | else { |
1046 | 3 | assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this. |
1047 | 3 | outgrad_size = 0; |
1048 | 10 | for (i = 0; i < model->input_size; i++7 ) |
1049 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1050 | 3 | ++outgrad_size; |
1051 | 3 | } |
1052 | 2.24k | compiled_data->outgrad_size = outgrad_size; |
1053 | 2.24k | parameter_size_maybe_more += outgrad_size; |
1054 | 2.24k | compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count); |
1055 | 2.24k | compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? compiled_data->gradients + parameter_size2.23k : 09 ; |
1056 | 2.24k | compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more); |
1057 | 2.24k | compiled_data->backward.to_size = parameter_size_maybe_more; |
1058 | 2.24k | ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0); |
1059 | 2.24k | if (compiled_data->parameter_flags) |
1060 | 4 | { |
1061 | 4 | parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size); |
1062 | 25 | for (i = 0; i < parameter_size; i++21 ) |
1063 | 21 | if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63))) |
1064 | 14 | parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1065 | 7 | else |
1066 | 7 | parameters[i] = NO_TENSOR_SYMBOL; |
1067 | 4 | } |
1068 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 02.23k ) |
1069 | 9 | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1070 | 2.23k | else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs. |
1071 | 2.22k | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1072 | 3 | else { // Compute minimize with gradients including selected inputs. |
1073 | 3 | assert(model->input_size > 0); |
1074 | 3 | assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this. |
1075 | 3 | assert(outgrad_size > 0); |
1076 | 3 | ccv_nnc_tensor_symbol_t outgrads[outgrad_size]; |
1077 | 3 | j = 0; |
1078 | 10 | for (i = 0; i < model->input_size; i++7 ) |
1079 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1080 | 3 | outgrads[j++] = model->inputs[i]; |
1081 | 3 | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1082 | 3 | } |
1083 | 2.24k | if (compiled_data->parameter_flags) |
1084 | 4 | ccfree(parameters); |
1085 | 2.24k | _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size); |
1086 | 2.24k | if (compiled_data->minimize.parameters) |
1087 | 5 | _ccv_cnnp_apply_parameters_with_minimizer(model); |
1088 | | // Go through gradient checkpoints to generate tensor inputs for backward pass just before executing the backward pass. |
1089 | 2.24k | ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph); |
1090 | 4.48k | for (i = 0; i < output_size; i++2.24k ) |
1091 | 2.24k | { |
1092 | 2.24k | const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]); |
1093 | | // Init this to 1 so we can backprop. |
1094 | 2.24k | ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES); |
1095 | 2.24k | } |
1096 | 2.24k | compiled_data->backward.to_size = 0; |
1097 | 7.14k | for (i = 0; i < parameter_size_maybe_more; i++4.90k ) |
1098 | 4.90k | if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL) |
1099 | 4.90k | compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]); |
1100 | 2.24k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS); |
1101 | 2.24k | ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size); |
1102 | 4.48k | for (i = 0; i < parameter_size_maybe_more - parameter_size; i++2.24k ) |
1103 | 2.24k | { |
1104 | 2.24k | if (compiled_data->outgrads[i].d < 0) // When we go through input, we might find zero-length inputs, and for these, we cannot have any outgrads. |
1105 | 0 | continue; |
1106 | 2.24k | const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]); |
1107 | 2.24k | const int* tos; |
1108 | 2.24k | int to_size; |
1109 | 2.24k | ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size); |
1110 | 2.24k | if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination. Otherwise this is covered in update_nodes. |
1111 | 9 | { |
1112 | 9 | const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph); |
1113 | 9 | const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph); |
1114 | 9 | int flag = 0; |
1115 | 9 | const int outgrad_destination_start = ccv_max(0, destination_count - i); |
1116 | 11 | for (j = i - 1; !flag && j >= 09 ; j--2 ) |
1117 | 2 | if (j + outgrad_destination_start < destination_count) |
1118 | 2 | flag = (destinations[j + outgrad_destination_start].d == outgrad.d); |
1119 | 9 | if (!flag) // Only if we cannot find it, we add it. |
1120 | 7 | ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad); |
1121 | 9 | } |
1122 | 2.24k | } |
1123 | 2.24k | if (parallel_count > 1) |
1124 | 8 | { |
1125 | 8 | ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count, |
1126 | 8 | 0, 0, |
1127 | 8 | compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */, |
1128 | 8 | compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */, |
1129 | 8 | 0, 0, 0, |
1130 | 8 | CCV_NNC_PARALLEL_REDUCE_OP_SUM, |
1131 | 8 | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1132 | 8 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1133 | 16 | for (i = 0; i < evaluate_to_size; i++8 ) |
1134 | 32 | for (j = 1; 8 j < parallel_count; j++24 ) |
1135 | 24 | { |
1136 | 24 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j); |
1137 | 24 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1138 | 24 | compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy; |
1139 | 24 | } |
1140 | 8 | const int backward_to_size = compiled_data->backward.to_size; |
1141 | 146 | for (i = 0; i < backward_to_size; i++138 ) |
1142 | 552 | for (j = 1; 138 j < parallel_count; j++414 ) |
1143 | 414 | { |
1144 | 414 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j); |
1145 | 414 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1146 | 414 | compiled_data->backward.tos[compiled_data->backward.to_size++] = copy; |
1147 | 414 | } |
1148 | 8 | } |
1149 | | // Only use memory compression if we are in gradient parameter mode. |
1150 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k ) |
1151 | 2.24k | { |
1152 | 2.24k | if (model->memory_compression) |
1153 | 0 | ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1154 | 2.24k | if (model->memory_reduction) |
1155 | 0 | ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1156 | 2.24k | } |
1157 | 2.24k | compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size); |
1158 | 2.24k | compiled_data->gradient_mode = gradient_mode; |
1159 | 2.24k | } |
1160 | | |
1161 | | void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1162 | 84 | { |
1163 | 84 | assert(!compiled_data->tensors.parameters); |
1164 | 84 | const int parameter_size = compiled_data->parameters->rnum; |
1165 | 84 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1166 | 84 | const int internal_size = compiled_data->internals->rnum; |
1167 | 84 | compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph); |
1168 | 84 | compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t)); |
1169 | 84 | compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*)); |
1170 | 84 | compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count; |
1171 | 84 | } |
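/* Editorial sketch (not part of the covered source): tensors_init.v above is a plain bitset
 * with one bit per tensor symbol, allocated as (size + 31) >> 5 uint32_t words; the same
 * d >> 5 / 1u << (d & 0x1f) pattern is used to test it later in init_1. A minimal standalone
 * analogue of the alloc / set / test pattern (hypothetical helper names):
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static uint32_t* bitset_new(const int size)
{
	return (uint32_t*)calloc((size + 31) >> 5, sizeof(uint32_t)); /* one bit per symbol */
}

static void bitset_set(uint32_t* const v, const int d)
{
	v[d >> 5] |= (1u << (d & 0x1f));
}

static int bitset_test(const uint32_t* const v, const int d)
{
	return !!(v[d >> 5] & (1u << (d & 0x1f)));
}

int main(void)
{
	uint32_t* const v = bitset_new(100);
	bitset_set(v, 37);
	assert(bitset_test(v, 37) && !bitset_test(v, 38));
	free(v);
	return 0;
}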
1172 | | |
1173 | | int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1174 | 3 | { |
1175 | 3 | int i, j; |
1176 | 3 | const int parameter_size = compiled_data->parameters->rnum; |
1177 | 3 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1178 | 3 | const int internal_size = compiled_data->internals->rnum; |
1179 | 19 | for (i = 0; i < parameter_size; i++16 ) |
1180 | 16 | { |
1181 | | // Parameters have to be allocated all together. |
1182 | 16 | if (compiled_data->tensors.parameters[i]) |
1183 | 16 | { |
1184 | 16 | for (j = 1; j < parallel_count; j++0 ) |
1185 | 0 | { assert(compiled_data->tensors.parameters[i + j * parameter_size]); } |
1186 | 16 | continue; |
1187 | 16 | } |
1188 | 0 | return 1; |
1189 | 16 | } |
1190 | 3 | for (i = 0; i < internal_size; i++0 ) |
1191 | 0 | { |
1192 | 0 | if (!compiled_data->tensors.internals[i]) |
1193 | 0 | return 1; |
1194 | 0 | for (j = 1; j < parallel_count; j++) |
1195 | 0 | if (!compiled_data->tensors.internals[i + j * internal_size]) |
1196 | 0 | return 1; |
1197 | 0 | } |
1198 | 3 | return 0; |
1199 | 3 | } |
1200 | | |
1201 | | void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1202 | 82 | { |
1203 | 82 | int i, j; |
1204 | 82 | const int parameter_size = compiled_data->parameters->rnum; |
1205 | 82 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1206 | 82 | const int internal_size = compiled_data->internals->rnum; |
1207 | 364 | for (i = 0; i < parameter_size; i++282 ) |
1208 | 282 | { |
1209 | | // Parameters have to be allocated all together. |
1210 | 282 | if (compiled_data->tensors.parameters[i]) |
1211 | 0 | { |
1212 | 0 | for (j = 1; j < parallel_count; j++) |
1213 | 0 | { assert(compiled_data->tensors.parameters[i + j * parameter_size]); } |
1214 | 0 | continue; |
1215 | 0 | } |
1216 | 282 | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1217 | 282 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter); |
1218 | 282 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1219 | 101 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1220 | 282 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1221 | 282 | compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0); |
1222 | 684 | for (j = 1; j < parallel_count; j++402 ) |
1223 | 402 | { |
1224 | 402 | if (j != device_id) |
1225 | 402 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1226 | 0 | else |
1227 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1228 | 402 | compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1229 | 402 | } |
1230 | 282 | } |
1231 | 82 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1232 | 141 | for (i = 0; i < internal_size; i++59 ) |
1233 | 59 | { |
1234 | 59 | const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i); |
1235 | 59 | const int d = retained.d; |
1236 | 59 | if (init_v[d >> 5] & (1u << (d & 0x1f))) |
1237 | 0 | continue; |
1238 | 59 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained); |
1239 | 59 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1240 | 7 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1241 | 59 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1242 | 59 | if (!compiled_data->tensors.internals[i]) |
1243 | 59 | compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0); |
1244 | 155 | for (j = 1; j < parallel_count; j++96 ) |
1245 | 96 | { |
1246 | 96 | if (j != device_id) |
1247 | 96 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1248 | 0 | else |
1249 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1250 | 96 | if (!compiled_data->tensors.internals[i + j * internal_size]) |
1251 | 96 | compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0); |
1252 | 96 | } |
1253 | 59 | } |
1254 | 82 | compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any. |
1255 | 82 | } |
1256 | | |
1257 | | static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1258 | 82 | { |
1259 | 82 | ccv_cnnp_model_tensors_init_0(model, compiled_data); |
1260 | 82 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1261 | 82 | } |
1262 | | |
1263 | | static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count) |
1264 | 6 | { |
1265 | 6 | assert(parallel_count > 0); |
1266 | 6 | int i, j; |
1267 | 12 | for (i = 0; i < tensor_size; i++6 ) |
1268 | 6 | { |
1269 | 6 | if (!tensors[i]) |
1270 | 0 | continue; |
1271 | 6 | const int d = tensor_symbols[i].d; |
1272 | 6 | if (!(tensors_init[d >> 5] & (1u << (d & 0x1f)))) |
1273 | 0 | continue; |
1274 | 24 | for (j = 1; 6 j < parallel_count; j++18 ) |
1275 | 18 | if (tensors[i + j * tensor_size]) |
1276 | 18 | { |
1277 | 18 | ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]); |
1278 | 18 | ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]); |
1279 | 18 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0); |
1280 | 18 | } |
1281 | 6 | } |
1282 | 6 | } |
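/* Editorial sketch (not part of the covered source): replicated tensors are kept in a single
 * flat array of tensor_size * parallel_count slots, where device j's copy of tensor i lives
 * at i + j * tensor_size; the copy loop above broadcasts the device-0 copy to the other
 * slots. A minimal indexing analogue with plain ints (hypothetical names):
 */
#include <assert.h>

static int replica_index(const int i, const int j, const int tensor_size)
{
	return i + j * tensor_size; /* tensor i on device j */
}

int main(void)
{
	int slots[12]; /* 4 tensors replicated across 3 devices */
	int i, j;
	for (j = 0; j < 3; j++)
		for (i = 0; i < 4; i++)
			slots[replica_index(i, j, 4)] = j * 100 + i; /* tag each slot with (device, tensor) */
	assert(slots[replica_index(2, 1, 4)] == 102); /* device 1, tensor 2 sits at slot 6 */
	return 0;
}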
1283 | | |
1284 | | static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count) |
1285 | 89 | { |
1286 | 89 | assert(parallel_count > 0); |
1287 | 89 | int i, j; |
1288 | 148 | for (i = 0; i < tensor_size; i++59 ) |
1289 | 59 | { |
1290 | 59 | const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1291 | 155 | for (j = 1; j < parallel_count; j++96 ) |
1292 | 96 | { |
1293 | 96 | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1294 | 96 | ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size]; |
1295 | 96 | if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1296 | 0 | { // We shouldn't have allocated this; free it up. |
1297 | 0 | ccv_nnc_tensor_free(tensors[i + j * tensor_size]); |
1298 | 0 | tensors[i + j * tensor_size] = 0; |
1299 | 0 | } |
1300 | 96 | } |
1301 | 59 | } |
1302 | 89 | } |
1303 | | |
1304 | | static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds) |
1305 | 479 | { |
1306 | 479 | assert(parallel_count > 0); |
1307 | 479 | int i, j; |
1308 | 1.81k | for (i = 0; i < tensor_size; i++1.33k ) |
1309 | 1.33k | { |
1310 | 1.33k | ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1311 | 1.33k | if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1312 | 7 | continue; |
1313 | 1.32k | if (graph) |
1314 | 1.32k | { |
1315 | 1.32k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol); |
1316 | 1.32k | if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1317 | 0 | tensor_symbol = alias_to; |
1318 | 1.32k | } |
1319 | 1.32k | ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]); |
1320 | 1.32k | if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1321 | 1.32k | { |
1322 | 1.32k | const ccv_nnc_tensor_bind_t retained_bind = { |
1323 | 1.32k | .symbol = tensor_symbol, |
1324 | 1.32k | .tensor = tensor |
1325 | 1.32k | }; |
1326 | 1.32k | ccv_array_push(tensor_binds, &retained_bind); |
1327 | 1.32k | } |
1328 | 2.87k | for (j = 1; j < parallel_count; j++1.54k ) |
1329 | 1.54k | { |
1330 | 1.54k | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1331 | 1.54k | ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size]; |
1332 | 1.54k | if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1333 | 1.54k | { |
1334 | 1.54k | const ccv_nnc_tensor_bind_t bind = { |
1335 | 1.54k | .symbol = copy, |
1336 | 1.54k | .tensor = tensors[i + j * tensor_size] |
1337 | 1.54k | }; |
1338 | 1.54k | ccv_array_push(tensor_binds, &bind); |
1339 | 1.54k | } |
1340 | 1.54k | } |
1341 | 1.32k | } |
1342 | 479 | } |
1343 | | |
1344 | | static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1345 | 2.38k | { |
1346 | 2.38k | if (compiled_data->graph) |
1347 | 89 | ccv_nnc_graph_free(compiled_data->graph); |
1348 | 2.38k | compiled_data->graph = 0; |
1349 | 2.38k | compiled_data->is_test = 0; |
1350 | 2.38k | if (compiled_data->tensor_arena) |
1351 | 89 | ccv_nnc_tensor_arena_free(compiled_data->tensor_arena); |
1352 | 2.38k | compiled_data->tensor_arena = 0; |
1353 | 2.38k | if (compiled_data->graph_exec_arena) |
1354 | 89 | ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena); |
1355 | 2.38k | compiled_data->graph_exec_arena = 0; |
1356 | 2.38k | if (compiled_data->backward.from_ops) |
1357 | 29 | ccfree(compiled_data->backward.from_ops); |
1358 | 2.38k | compiled_data->backward.from_ops = 0; |
1359 | 2.38k | if (compiled_data->evaluate.schedule) |
1360 | 34 | ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule); |
1361 | 2.38k | compiled_data->evaluate.schedule = 0; |
1362 | 2.38k | if (compiled_data->backward.schedule) |
1363 | 25 | ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule); |
1364 | 2.38k | compiled_data->backward.schedule = 0; |
1365 | 2.38k | } |
1366 | | |
1367 | | static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1368 | 2.29k | { |
1369 | 2.29k | if (compiled_data->gradients) |
1370 | 2.24k | ccfree(compiled_data->gradients); |
1371 | 2.29k | compiled_data->gradients = 0; |
1372 | 2.29k | if (compiled_data->updated_parameters) |
1373 | 2.24k | ccfree(compiled_data->updated_parameters); |
1374 | 2.29k | compiled_data->updated_parameters = 0; |
1375 | 2.29k | compiled_data->update_nodes = 0; |
1376 | 2.29k | compiled_data->saved_aux = 0; |
1377 | 2.29k | } |
1378 | | |
1379 | | static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1380 | 2.33k | { |
1381 | 2.33k | if (compiled_data->backward.gradients) |
1382 | 5 | ccfree(compiled_data->backward.gradients); |
1383 | 2.33k | compiled_data->backward.gradients = 0; |
1384 | 2.33k | if (compiled_data->backward.accum) |
1385 | 5 | ccv_nnc_graph_free(compiled_data->backward.accum); |
1386 | 2.33k | compiled_data->backward.accum = 0; |
1387 | 2.33k | if (compiled_data->backward.tensor_arena) |
1388 | 5 | ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena); |
1389 | 2.33k | compiled_data->backward.tensor_arena = 0; |
1390 | 2.33k | if (compiled_data->backward.graph_exec_arena) |
1391 | 5 | ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena); |
1392 | 2.33k | compiled_data->backward.graph_exec_arena = 0; |
1393 | 2.33k | } |
1394 | | |
1395 | | static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1396 | 2.30k | { |
1397 | 2.30k | if (compiled_data->apply_gradients.graph) |
1398 | 21 | ccv_nnc_graph_free(compiled_data->apply_gradients.graph); |
1399 | 2.30k | compiled_data->apply_gradients.graph = 0; |
1400 | 2.30k | if (compiled_data->apply_gradients.tensor_arena) |
1401 | 21 | ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena); |
1402 | 2.30k | compiled_data->apply_gradients.tensor_arena = 0; |
1403 | 2.30k | if (compiled_data->apply_gradients.graph_exec_arena) |
1404 | 21 | ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena); |
1405 | 2.30k | compiled_data->apply_gradients.graph_exec_arena = 0; |
1406 | 2.30k | } |
1407 | | |
1408 | | // Compile the graph to run ccv_cnnp_model_fit |
1409 | | static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1410 | 8 | { |
1411 | 8 | int i, j; |
1412 | 8 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1413 | 8 | assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE); |
1414 | 8 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE; |
1415 | 8 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1416 | 8 | assert(output_size == model->output_size * parallel_count); |
1417 | 8 | assert(!fits || output_size == fit_size); |
1418 | 8 | assert(output_size > 0); |
1419 | 8 | if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
1420 | 8 | { |
1421 | 8 | _ccv_cnnp_model_set_rewindables(model); |
1422 | 8 | _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size); |
1423 | 8 | } else if (0 compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0 ) { |
1424 | 0 | _ccv_cnnp_model_rewind_graph(model); |
1425 | 0 | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
1426 | 0 | compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE; |
1427 | 0 | _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size); |
1428 | 0 | } |
1429 | 8 | const int tensors_init = !!compiled_data->tensors_init.v; |
1430 | 8 | if (!tensors_init) |
1431 | 4 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1432 | 4 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1433 | | // Check whether it is fully allocated; if it is not, run init_1. |
1434 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1435 | 8 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1436 | 8 | assert((input_size % parallel_count) == 0); |
1437 | 8 | assert((output_size % parallel_count) == 0); |
1438 | 8 | assert((fit_size % parallel_count) == 0); |
1439 | 8 | const int input_size_per_p = input_size / parallel_count; |
1440 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1441 | 8 | const int output_size_per_p = output_size / parallel_count; |
1442 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1443 | 8 | const int fit_size_per_p = fit_size / parallel_count; |
1444 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds); |
1445 | 8 | const int parameter_size = compiled_data->parameters->rnum; |
1446 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1447 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1448 | 8 | const int internal_size = compiled_data->internals->rnum; |
1449 | 8 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1450 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1451 | 8 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1452 | 8 | ccv_array_free(tensor_binds); |
1453 | 8 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1454 | 8 | if (tensors_init && parallel_count > 14 ) |
1455 | 0 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1456 | | // If tensor is not init'ed, we need to init states first. |
1457 | 8 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1458 | 7 | { |
1459 | 7 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1460 | 7 | .parallel_count = parallel_count, |
1461 | 7 | .graph = model->graph, |
1462 | 7 | .compiled_data = compiled_data, |
1463 | 7 | .tensor_arena = compiled_data->tensor_arena |
1464 | 7 | }; |
1465 | 7 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1466 | 7 | } |
1467 | 8 | compiled_data->is_test = 0; |
1468 | 8 | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer); |
1469 | | // No need to set because it defaults to training mode. |
1470 | | // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update); |
1471 | 105 | for (i = 0; i < saved_aux_size * parameter_size; i++97 ) |
1472 | 97 | { |
1473 | 97 | if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1474 | 5 | continue; |
1475 | 92 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source); |
1476 | 92 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0); |
1477 | 296 | for (j = 1; j < parallel_count; j++204 ) |
1478 | 204 | { |
1479 | 204 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j)); |
1480 | 204 | if (copy) |
1481 | 204 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, ©, 1, 0); |
1482 | 204 | } |
1483 | 92 | } |
1484 | 8 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1485 | 8 | compiled_data->evaluate.to_op_size = 0; |
1486 | 22 | for (i = 0; i < evaluate_to_size; i++14 ) |
1487 | 14 | { |
1488 | 14 | ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]); |
1489 | 14 | if (to.graph) |
1490 | 14 | compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to; |
1491 | 14 | } |
1492 | 8 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1493 | 8 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1494 | 8 | } |
1495 | | |
1496 | | ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model) |
1497 | 0 | { |
1498 | 0 | const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1499 | 0 | if (!compiled_data || !compiled_data->graph) |
1500 | 0 | return 0; |
1501 | 0 | return ccv_nnc_graph_default_stream(compiled_data->graph); |
1502 | 0 | } |
1503 | | |
1504 | | uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model) |
1505 | 0 | { |
1506 | 0 | const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1507 | 0 | if (!compiled_data || !compiled_data->tensor_arena) |
1508 | 0 | return 0; |
1509 | 0 | return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena); |
1510 | 0 | } |
1511 | | |
1512 | | static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count) |
1513 | 38.8k | { |
1514 | 38.8k | int i, j; |
1515 | 114k | for (i = 0; i < tensor_size; i++75.5k ) |
1516 | 75.5k | { |
1517 | 75.5k | ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1518 | 75.5k | if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1519 | 0 | continue; |
1520 | 75.5k | if (graph) |
1521 | 72.5k | { |
1522 | 72.5k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol); |
1523 | 72.5k | if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1524 | 0 | tensor_symbol = alias_to; |
1525 | 72.5k | } |
1526 | 75.5k | ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]); |
1527 | 77.3k | for (j = 1; j < parallel_count; j++1.77k ) |
1528 | 1.77k | { |
1529 | 1.77k | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1530 | 1.77k | if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1531 | 1.77k | ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]); |
1532 | 1.77k | } |
1533 | 75.5k | } |
1534 | 38.8k | } |
1535 | | |
1536 | | void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1537 | 2.54k | { |
1538 | 2.54k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1539 | 2.54k | assert(compiled_data); |
1540 | 2.54k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1541 | 2.54k | assert(output_size == model->output_size * parallel_count); |
1542 | 2.54k | assert(input_size == model->input_size * parallel_count); |
1543 | 2.54k | assert(!fits || fit_size == output_size); |
1544 | 2.54k | assert(model->graph); |
1545 | 2.54k | if (!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k ) |
1546 | 8 | { |
1547 | 8 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
1548 | 8 | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
1549 | 8 | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
1550 | | // Compile the symbolic graph down only when needed. |
1551 | 8 | _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size); |
1552 | 2.53k | } else { |
1553 | 2.53k | assert((input_size % parallel_count) == 0); |
1554 | 2.53k | assert((output_size % parallel_count) == 0); |
1555 | 2.53k | assert((fit_size % parallel_count) == 0); |
1556 | 2.53k | const int input_size_per_p = input_size / parallel_count; |
1557 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count); |
1558 | 2.53k | const int output_size_per_p = output_size / parallel_count; |
1559 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count); |
1560 | 2.53k | const int fit_size_per_p = fit_size / parallel_count; |
1561 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count); |
1562 | 2.53k | } |
1563 | 2.54k | if (compiled_data->is_test) |
1564 | 0 | { |
1565 | 0 | compiled_data->is_test = 0; |
1566 | 0 | ccv_nnc_graph_exec_update_t update = { |
1567 | 0 | .parallel_count = parallel_count, |
1568 | 0 | .graph = model->graph, |
1569 | 0 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1570 | 0 | }; |
1571 | 0 | ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update); |
1572 | 0 | } |
1573 | 2.54k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context); |
1574 | 2.54k | } |
1575 | | |
1576 | | // Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = false (MULTISTAGE_MODE_NO_GRAD). |
1577 | | static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1578 | 52 | { |
1579 | 52 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1580 | 52 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD; |
1581 | 52 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1582 | 52 | assert(output_size == model->output_size * parallel_count); |
1583 | 52 | assert(output_size > 0); |
1584 | | // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather, |
1585 | | // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel. |
1586 | 52 | if (parallel_count > 1 && compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6 ) |
1587 | 6 | { |
1588 | 6 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1589 | 6 | compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count); |
1590 | 6 | _ccv_cnnp_model_set_rewindables(model); |
1591 | 6 | ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count, |
1592 | 6 | 0, 0, |
1593 | 6 | 0, 0, 0, |
1594 | 6 | 0, 0, 0, |
1595 | 6 | CCV_NNC_PARALLEL_REDUCE_OP_SUM, |
1596 | 6 | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1597 | 6 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1598 | 6 | int i, j; |
1599 | 12 | for (i = 0; i < evaluate_to_size; i++6 ) |
1600 | 24 | for (j = 1; 6 j < parallel_count; j++18 ) |
1601 | 18 | { |
1602 | 18 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j); |
1603 | 18 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1604 | 18 | compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy; |
1605 | 18 | } |
1606 | 6 | } |
1607 | 52 | const int tensors_init = !!compiled_data->tensors_init.v; |
1608 | 52 | if (!tensors_init) |
1609 | 31 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1610 | 21 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1611 | | // Check whether it is fully allocated; if it is not, run init_1. |
1612 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1613 | 52 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1614 | 52 | assert((input_size % parallel_count) == 0); |
1615 | 52 | assert((output_size % parallel_count) == 0); |
1616 | 52 | const int input_size_per_p = input_size / parallel_count; |
1617 | 52 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1618 | 52 | const int output_size_per_p = output_size / parallel_count; |
1619 | 52 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1620 | 52 | const int parameter_size = compiled_data->parameters->rnum; |
1621 | 52 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1622 | 52 | const int internal_size = compiled_data->internals->rnum; |
1623 | 52 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1624 | 52 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1625 | | // If we generated gradient for the graph, only compile part of the graph because the rest is irrelevant for evaluation. |
1626 | 52 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1627 | 52 | ccv_array_free(tensor_binds); |
1628 | 52 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1629 | | // If tensor is not init'ed, we need to init states first. |
1630 | 52 | if (tensors_init && parallel_count > 121 ) |
1631 | 6 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1632 | 52 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1633 | 16 | { |
1634 | 16 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1635 | 16 | .parallel_count = parallel_count, |
1636 | 16 | .graph = model->graph, |
1637 | 16 | .compiled_data = compiled_data, |
1638 | 16 | .tensor_arena = compiled_data->tensor_arena |
1639 | 16 | }; |
1640 | 16 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1641 | 16 | } |
1642 | 52 | compiled_data->is_test = 1; |
1643 | 52 | ccv_nnc_graph_exec_update_t update = { |
1644 | 52 | .parallel_count = parallel_count, |
1645 | 52 | .graph = model->graph, |
1646 | 52 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1647 | 52 | }; |
1648 | 52 | ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update); |
1649 | 52 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1650 | 52 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1651 | 52 | } |
1652 | | |
1653 | | static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1654 | 28 | { |
1655 | 28 | assert(!compiled_data->tensors.gradients); |
1656 | 28 | const int parameter_size = compiled_data->parameters->rnum; |
1657 | 28 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1658 | 28 | compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count); |
1659 | 28 | compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count; |
1660 | 28 | int i, j; |
1661 | 175 | for (i = 0; i < parameter_size; i++147 ) |
1662 | 147 | { |
1663 | 147 | if (compiled_data->parameter_flags && !(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6 ) |
1664 | 2 | { |
1665 | 2 | compiled_data->tensors.gradients[i] = 0; |
1666 | 2 | compiled_data->tensors.accum_gradients[i] = 0; |
1667 | 2 | for (j = 1; j < parallel_count; j++0 ) |
1668 | 0 | { |
1669 | 0 | compiled_data->tensors.gradients[i + j * parameter_size] = 0; |
1670 | 0 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0; |
1671 | 0 | } |
1672 | 2 | continue; |
1673 | 2 | } |
1674 | 145 | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1675 | 145 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter); |
1676 | 145 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1677 | 38 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1678 | 145 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1679 | 145 | compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0); |
1680 | 145 | compiled_data->tensors.accum_gradients[i] = 0; // Delay the accumulated gradient allocation until we need it. |
1681 | 325 | for (j = 1; j < parallel_count; j++180 ) |
1682 | 180 | { |
1683 | 180 | if (j != device_id) |
1684 | 180 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1685 | 0 | else |
1686 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1687 | 180 | compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1688 | 180 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0; |
1689 | 180 | } |
1690 | 145 | } |
1691 | 28 | } |
1692 | | |
1693 | | static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size) |
1694 | 7.99k | { |
1695 | 7.99k | if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL) |
1696 | 15 | return 1; |
1697 | 7.97k | if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) |
1698 | 7.97k | return 0; |
1699 | 7 | int i; |
1700 | 7 | for (i = 0; i < input_size; i++0 ) |
1701 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1702 | 7 | return 0; |
1703 | 0 | return 1; |
1704 | 7 | } |
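/* Editorial sketch (not part of the covered source): disable_outgrad carries one bit per
 * model input; bit i set means "do not compute the output gradient w.r.t. input i". The
 * function above answers "are all inputs disabled?". A minimal standalone analogue; the
 * two sentinel values below are assumptions, not the library's actual definitions:
 */
#include <assert.h>
#include <stdint.h>

#define SKETCH_DISABLE_OUTGRAD_NONE ((uint64_t)0) /* assumed: compute all input gradients */
#define SKETCH_DISABLE_OUTGRAD_ALL ((uint64_t)-1) /* assumed: compute none of them */

static int is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
{
	if (disable_outgrad == SKETCH_DISABLE_OUTGRAD_ALL)
		return 1;
	if (disable_outgrad == SKETCH_DISABLE_OUTGRAD_NONE)
		return 0;
	int i;
	for (i = 0; i < input_size; i++)
		if (!(disable_outgrad & ((uint64_t)1 << i)))
			return 0; /* at least one input still wants its gradient */
	return 1;
}

int main(void)
{
	assert(is_disable_outgrad_all(SKETCH_DISABLE_OUTGRAD_ALL, 3) == 1);
	assert(is_disable_outgrad_all((uint64_t)0x3, 3) == 0); /* input 2 still enabled */
	assert(is_disable_outgrad_all((uint64_t)0x7, 3) == 1); /* all three inputs disabled */
	return 0;
}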
1705 | | |
1706 | | // Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE). |
1707 | | // Particularly, this method compiles the evaluation and backprop graph (the main graph). |
1708 | | static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1709 | 29 | { |
1710 | 29 | int i, j; |
1711 | 29 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1712 | 29 | const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1 : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS28 ; |
1713 | 29 | assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode); |
1714 | 29 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE; |
1715 | 29 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1716 | 29 | assert(output_size == model->output_size * parallel_count); |
1717 | 29 | assert(output_size > 0); |
1718 | | // There shouldn't be a loss function if we evaluate with multistage jit. |
1719 | 29 | assert(compiled_data->loss.cmd == CCV_NNC_NOOP); |
1720 | 29 | if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
1721 | 27 | { |
1722 | 27 | _ccv_cnnp_model_set_rewindables(model); |
1723 | 27 | _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here. |
1724 | 27 | } else if (2 compiled_data->gradient_mode != target_gradient_mode2 ) { |
1725 | 2 | _ccv_cnnp_model_rewind_graph(model); |
1726 | 2 | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
1727 | 2 | compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE; |
1728 | 2 | _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here. |
1729 | 2 | } |
1730 | 29 | const int tensors_init = !!compiled_data->tensors_init.v; |
1731 | 29 | if (!tensors_init) |
1732 | 21 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1733 | 8 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1734 | | // Check whether it is fully allocated; if it is not, run init_1. |
1735 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1736 | 29 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1737 | 29 | assert((input_size % parallel_count) == 0); |
1738 | 29 | assert((output_size % parallel_count) == 0); |
1739 | 29 | const int input_size_per_p = input_size / parallel_count; |
1740 | 29 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1741 | 29 | const int output_size_per_p = output_size / parallel_count; |
1742 | 29 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1743 | 29 | const int parameter_size = compiled_data->parameters->rnum; |
1744 | 29 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1745 | 29 | const int internal_size = compiled_data->internals->rnum; |
1746 | 29 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1747 | 29 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1748 | 29 | if (!compiled_data->tensors.gradients) |
1749 | 28 | _ccv_cnnp_model_gradient_tensors_init(model, compiled_data); |
1750 | 29 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds); |
1751 | 29 | if (compiled_data->backward.to_size > 0) |
1752 | 29 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1753 | 0 | else |
1754 | 0 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1755 | 29 | ccv_array_free(tensor_binds); |
1756 | 29 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1757 | 29 | if (tensors_init && parallel_count > 18 ) |
1758 | 0 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1759 | | // If tensor is not init'ed, we need to init states first. |
1760 | 29 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1761 | 18 | { |
1762 | 18 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1763 | 18 | .parallel_count = parallel_count, |
1764 | 18 | .graph = model->graph, |
1765 | 18 | .compiled_data = compiled_data, |
1766 | 18 | .tensor_arena = compiled_data->tensor_arena |
1767 | 18 | }; |
1768 | 18 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1769 | 18 | } |
1770 | 29 | compiled_data->is_test = is_test; |
1771 | 29 | ccv_nnc_graph_exec_update_t update = { |
1772 | 29 | .parallel_count = parallel_count, |
1773 | 29 | .graph = model->graph, |
1774 | 29 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1775 | 29 | }; |
1776 | 29 | ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update); |
1777 | 29 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1778 | 29 | compiled_data->evaluate.to_op_size = 0; |
1779 | 29 | ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0); |
1780 | 76 | for (i = 0; i < evaluate_to_size; i++47 ) |
1781 | 47 | { |
1782 | 47 | ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]); |
1783 | 47 | if (to_op.graph) |
1784 | 47 | compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op; |
1785 | 47 | const int* tos; |
1786 | 47 | int to_size; |
1787 | 47 | ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size); |
1788 | 94 | for (j = 0; j < to_size; j++47 ) |
1789 | 47 | { |
1790 | 47 | ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){ |
1791 | 47 | .d = tos[j], |
1792 | 47 | .graph = model->graph |
1793 | 47 | }); |
1794 | 47 | if (to_op.graph) |
1795 | 47 | ccv_array_add_unique_int(backward_from, to_op.d); |
1796 | 47 | } |
1797 | 47 | } |
1798 | 29 | assert(backward_from->rnum > 0); |
1799 | 29 | compiled_data->backward.from_op_size = backward_from->rnum; |
1800 | 29 | compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum); |
1801 | 76 | for (i = 0; i < backward_from->rnum; i++47 ) |
1802 | 47 | compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){ |
1803 | 47 | .d = *(int*)ccv_array_get(backward_from, i), |
1804 | 47 | .graph = compiled_data->graph, |
1805 | 47 | }; |
1806 | | // If there are any set nodes (to set some tensors to 0) inserted through the backward pass, these won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find these nodes and explicitly add them to backward.from_ops. |
1807 | 29 | ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0); |
1808 | 29 | const int exec_info_size = compiled_data->graph->exec_info->rnum; |
1809 | 29 | uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t)); |
1810 | 29 | const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0); |
1811 | 29 | const int source_size = compiled_data->graph->sources->rnum; |
1812 | 58 | ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new29 (compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0); |
1813 | 600 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1814 | 600 | visited[(idx >> 5)] |= (1u << (idx & 31)); |
1815 | 600 | } ccv_nnc_graph_visit_endfor |
1816 | 58 | ccv_nnc_graph_visit_free(visit); |
1817 | 58 | const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)ccv_array_get29 (compiled_data->graph->destinations, 0); |
1818 | 58 | const int destination_size = compiled_data->graph->destinations->rnum; |
1819 | 58 | visit = ccv_nnc_graph_visit_new29 (compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0); |
1820 | 654 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1821 | 654 | visited[(idx >> 5)] |= (1u << (idx & 31)); |
1822 | 654 | } ccv_nnc_graph_visit_endfor |
1823 | 58 | ccv_nnc_graph_visit_free(visit); |
1824 | 58 | visit = ccv_nnc_graph_visit_new29 (compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0); |
1825 | | // Find any missing nodes to be added as source. Right now, these are only set nodes. |
1826 | 1.30k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1827 | 1.30k | if (!(visited[(idx >> 5)] & (1u << (idx & 31)))) |
1828 | 47 | { |
1829 | 47 | assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD); |
1830 | 47 | if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-case the set command that zeroes out a tensor, not the one that sets the gradient to 1.
1831 | 0 | ccv_array_add_unique_int(backward_from, idx); |
1832 | 47 | } |
1833 | 1.30k | } ccv_nnc_graph_visit_endfor |
1834 | 29 | ccv_nnc_graph_visit_free(visit); |
1835 | 29 | ccfree(visited); |
1836 | 29 | if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this. |
1837 | 0 | { |
1838 | 0 | compiled_data->backward.from_op_size = backward_from->rnum; |
1839 | 0 | compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum); |
1840 | 0 | for (i = 0; i < backward_from->rnum; i++) |
1841 | 0 | compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){ |
1842 | 0 | .d = *(int*)ccv_array_get(backward_from, i), |
1843 | 0 | .graph = compiled_data->graph, |
1844 | 0 | }; |
1845 | 0 | } |
1846 | 29 | ccv_array_free(backward_from); |
1847 | 29 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1848 | 29 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1849 | 29 | } |
1850 | | |
1851 | | void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1852 | 7.96k | { |
1853 | 7.96k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1854 | 7.96k | assert(compiled_data); |
1855 | 7.96k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1856 | 7.96k | assert(output_size == model->output_size * parallel_count); |
1857 | 7.96k | assert(input_size == model->input_size * parallel_count); |
1858 | 7.96k | assert(model->graph); |
1859 | 7.96k | const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14 : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.95k ; |
1860 | 7.96k | const int mode_mismatch = (params.requires_grad && (7.82k compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.82k || compiled_data->gradient_mode != target_gradient_mode7.79k || compiled_data->disable_outgrad != params.disable_outgrad7.79k )); |
1861 | 7.96k | if (!compiled_data->graph || mode_mismatch7.88k ) |
1862 | 81 | { |
1863 | 81 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
1864 | 81 | if (mode_mismatch) // If the mode mismatches, we need to redo the backward as well (no need to redo apply_gradients; it doesn't depend on target_gradient_mode or disable_outgrad).
1865 | 29 | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
1866 | 81 | if (params.requires_grad) |
1867 | 29 | _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size); |
1868 | 52 | else |
1869 | 52 | _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size); |
1870 | 7.88k | } else { |
1871 | 7.88k | ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena); |
1872 | 7.88k | assert((input_size % parallel_count) == 0); |
1873 | 7.88k | const int input_size_per_p = input_size / parallel_count; |
1874 | 7.88k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count); |
1875 | 7.88k | assert((output_size % parallel_count) == 0); |
1876 | 7.88k | const int output_size_per_p = output_size / parallel_count; |
1877 | 7.88k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count); |
1878 | 7.88k | } |
1879 | 7.96k | if (compiled_data->is_test != params.is_test) |
1880 | 57 | { |
1881 | 57 | compiled_data->is_test = params.is_test; |
1882 | 57 | ccv_nnc_graph_exec_update_t update = { |
1883 | 57 | .parallel_count = parallel_count, |
1884 | 57 | .graph = model->graph, |
1885 | 57 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1886 | 57 | }; |
1887 | 57 | ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update); |
1888 | 57 | } |
1889 | 7.96k | } |
1890 | | |
1891 | | void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1892 | 7.96k | { |
1893 | 7.96k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1894 | 7.96k | assert(compiled_data); |
1895 | 7.96k | ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size); |
1896 | 7.96k | if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD) |
1897 | 65 | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context); |
1898 | 7.90k | else { |
1899 | 7.90k | if (!compiled_data->evaluate.schedule) |
1900 | 34 | compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size); |
1901 | 7.90k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context); |
1902 | 7.90k | } |
1903 | 7.96k | } |
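
For orientation, a minimal inference-mode call of the evaluate path above might look like the sketch below. This is only a sketch: `model` stands for an already-compiled model with one input and one output, the tensor shapes are hypothetical, and the headers included at the top of this file are assumed to be available.

	// Sketch only: hypothetical shapes, `model` assumed compiled elsewhere.
	ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 28, 28, 1), 0);
	ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 10), 0);
	// requires_grad = 0 requests inference only; is_test = 1 typically runs test-time behavior (e.g. batch norm uses running statistics).
	ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
		.requires_grad = 0,
		.is_test = 1,
	}, TENSOR_LIST(x), TENSOR_LIST(y), 0, 0);
	ccv_nnc_tensor_free(x);
	ccv_nnc_tensor_free(y);
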
1904 | | |
1905 | | // Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE). |
1906 | | // Particularly, this method compiles the accumulator graph. |
1907 | | static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model) |
1908 | 5 | { |
1909 | 5 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1910 | 5 | assert(compiled_data); |
1911 | 5 | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
1912 | 5 | ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new(); |
1913 | 5 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1914 | 5 | const int parameter_size = compiled_data->parameters->rnum; |
1915 | 5 | int i, j; |
1916 | 5 | compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3); |
1917 | 5 | compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count; |
1918 | 5 | compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count; |
1919 | 20 | for (i = 0; i < parameter_size; i++15 ) |
1920 | 30 | for (j = 0; 15 j < parallel_count; j++15 ) |
1921 | 15 | if (compiled_data->tensors.gradients[i + j * parameter_size]) |
1922 | 15 | { |
1923 | 15 | const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info; |
1924 | | // Now the old gradient is the accumulated gradient; set up a new gradient tensor so we can collect into it.
1925 | 15 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size]; |
1926 | 15 | compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1927 | 15 | ccv_nnc_tensor_symbol_t inputs[2]; |
1928 | 15 | inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1929 | 15 | inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1930 | 15 | ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1931 | 15 | ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0); |
1932 | 15 | } else { |
1933 | 0 | compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1934 | 0 | compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1935 | 0 | compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1936 | 0 | } |
1937 | 5 | ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1938 | 5 | if (ccv_nnc_symbolic_graph_source_size(accum) == 0) |
1939 | 0 | { |
1940 | 0 | ccv_nnc_symbolic_graph_free(accum); |
1941 | | // Create empty graph. |
1942 | 0 | compiled_data->backward.accum = ccv_nnc_graph_new(); |
1943 | 0 | ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0); |
1944 | 0 | return; |
1945 | 0 | } |
1946 | 5 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1947 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds); |
1948 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds); |
1949 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds); |
1950 | 5 | ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena); |
1951 | 5 | ccv_nnc_symbolic_graph_free(accum); |
1952 | 5 | ccv_array_free(tensor_binds); |
1953 | 5 | ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count); |
1954 | 5 | } |
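
In other words, the accumulator graph jitted here computes updated_accum_gradient = accum_gradient + gradient per parameter via CMD_EWSUM_FORWARD. Below is a standalone sketch of that per-parameter pattern, with a hypothetical 10x10 parameter shape; it assumes the same headers this file already includes.

	// Standalone sketch of the per-parameter accumulation pattern used above (hypothetical shape).
	ccv_nnc_symbolic_graph_t* const accum = ccv_nnc_symbolic_graph_new();
	const ccv_nnc_tensor_param_t info = CPU_TENSOR_NHWC(32F, 10, 10);
	ccv_nnc_tensor_symbol_t inputs[2];
	inputs[0] = ccv_nnc_tensor_symbol_new(accum, info, "accum_gradient");
	inputs[1] = ccv_nnc_tensor_symbol_new(accum, info, "gradient");
	ccv_nnc_tensor_symbol_t output = ccv_nnc_tensor_symbol_new(accum, info, "updated_accum_gradient");
	// updated_accum_gradient = accum_gradient + gradient
	ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, "accum");
	ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
	ccv_nnc_symbolic_graph_free(accum);
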
1955 | | |
1956 | | void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1957 | 7.88k | { |
1958 | 7.88k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1959 | 7.88k | assert(compiled_data); |
1960 | 7.88k | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
1961 | 7.88k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1962 | 7.88k | assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count); |
1963 | 7.88k | if (outgrad_size > 0) |
1964 | 2.51k | { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); } |
1965 | 7.88k | assert(model->graph); |
1966 | 7.88k | assert(compiled_data->graph); |
1967 | 7.88k | const int parameter_size = compiled_data->parameters->rnum; |
1968 | | // If we need to accumulate the gradients now, do jit on accumulator. |
1969 | 7.88k | if (compiled_data->backward.count > 0) |
1970 | 1.71k | { |
1971 | 1.71k | if (!compiled_data->backward.accum) |
1972 | 5 | _ccv_cnnp_model_multistage_jit_1(model); |
1973 | 1.71k | else if (compiled_data->backward.count == 1) { |
1974 | | // On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly). |
1975 | 496 | int i; |
1976 | 1.48k | for (i = 0; i < parameter_size * parallel_count; i++986 ) |
1977 | 986 | { |
1978 | 986 | ccv_nnc_tensor_t* tensor; |
1979 | 986 | CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor); |
1980 | 986 | } |
1981 | 496 | if (compiled_data->backward.tensor_arena) |
1982 | 496 | { |
1983 | 496 | ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena); |
1984 | | // Rebind in case we messed up the bindings (we just switched accum_gradients and gradients).
1985 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1); |
1986 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1); |
1987 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1); |
1988 | 496 | } |
1989 | 496 | } |
1990 | 1.71k | } |
1991 | 7.88k | const int ingrad_size_per_p = model->output_size; |
1992 | 7.88k | const int outgrad_size_per_p = compiled_data->outgrad_size; |
1993 | 7.88k | int i, j; |
1994 | 15.7k | for (i = 0; i < ingrad_size_per_p; i++7.88k ) |
1995 | 7.88k | { |
1996 | 7.88k | const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]); |
1997 | 7.88k | if (!ingrad_size || !ingrads3.79k || ingrads[i] == 03.79k ) |
1998 | 4.19k | { |
1999 | | // Set it to 1 if it is not specified. |
2000 | 4.19k | ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad); |
2001 | 4.19k | if (ingrad_tensor) |
2002 | 4.19k | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context); |
2003 | 4.31k | for (j = 1; j < parallel_count; j++120 ) |
2004 | 120 | { |
2005 | 120 | ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j)); |
2006 | 120 | if (ingrad_tensor) |
2007 | 120 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context); |
2008 | 120 | } |
2009 | 4.19k | } else { |
2010 | | // Make sure the length matches, in case it is an alias. |
2011 | 3.69k | assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad))); |
2012 | 3.69k | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]); |
2013 | 3.69k | for (j = 1; j < parallel_count; j++6 ) |
2014 | 6 | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]); |
2015 | 3.69k | } |
2016 | 7.88k | } |
2017 | 7.88k | if (outgrad_size > 0) |
2018 | 2.51k | { |
2019 | 2.51k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad"); |
2020 | 5.14k | for (i = 0; 2.51k i < outgrad_size_per_p; i++2.62k ) |
2021 | 2.62k | if (outgrads[i]) |
2022 | 2.43k | { |
2023 | 2.43k | const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i]; |
2024 | 2.43k | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]); |
2025 | 2.43k | for (j = 1; j < parallel_count; j++6 ) |
2026 | 6 | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]); |
2027 | 2.43k | } |
2028 | 5.37k | } else { |
2029 | 5.37k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || |
2030 | 5.37k | compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS); |
2031 | 5.37k | } |
2032 | | // We need to rebind here because ccv_cnnp_model_evaluate clears bindings, which resets all bindings for the gradients.
2033 | | // For parameters and internals this is fine: clearing bindings restores the original bindings, which are exactly these
2034 | | // parameters and internals. The same cannot be said for gradients because of the accum_gradients switching.
2035 | 7.88k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count); |
2036 | 7.88k | if (!compiled_data->backward.schedule) |
2037 | 25 | compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0); |
2038 | | // Run the backward pass. |
2039 | 7.88k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context); |
2040 | | // If we need to run the accumulation round, do that now.
2041 | 7.88k | if (compiled_data->backward.count > 0) |
2042 | 1.71k | ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context); |
2043 | | // Update the count; this determines whether we need to accumulate or not.
2044 | 7.88k | ++compiled_data->backward.count; |
2045 | 7.88k | } |
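
A sketch of how this pairs with the evaluate call above for one forward/backward step: `model` stands for a compiled model with a single input and output, requires_grad was set on evaluate, and the shapes below are hypothetical. Passing an ingrad supplies d(loss)/d(output); passing an outgrad buffer asks for d(loss)/d(input), which requires that outgrad was not disabled on evaluate.

	// Sketch only: hypothetical shapes; tensor frees omitted for brevity.
	ccv_nnc_tensor_t* const x = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0);
	ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 10), 0);
	ccv_nnc_tensor_t* const dy = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 10), 0); // d(loss)/d(y), filled by the caller.
	ccv_nnc_tensor_t* const dx = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 1, 64), 0); // Receives d(loss)/d(x).
	ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
		.requires_grad = 1,
	}, TENSOR_LIST(x), TENSOR_LIST(y), 0, 0);
	// dy is bound as the incoming gradient; dx is bound to receive the gradient w.r.t. the input.
	ccv_cnnp_model_backward(model, TENSOR_LIST(dy), TENSOR_LIST(dx), 0, 0);
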
2046 | | |
2047 | | // Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE). |
2048 | | // Particularly, this method compiles the parameter update graph. |
2049 | | static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model) |
2050 | 21 | { |
2051 | 21 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2052 | 21 | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
2053 | 21 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2054 | 21 | const int parameter_size = compiled_data->parameters->rnum; |
2055 | 21 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
2056 | 21 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
2057 | 21 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
2058 | | // Bind accumulated gradients. |
2059 | 21 | if (compiled_data->backward.count > 1) |
2060 | 4 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds); |
2061 | 17 | else |
2062 | 17 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds); |
2063 | 21 | ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0); |
2064 | 21 | int i, j; |
2065 | 247 | for (i = 0; i < compiled_data->backward.to_size; i++226 ) |
2066 | 226 | { |
2067 | 226 | const int* tos; |
2068 | 226 | int to_size; |
2069 | 226 | ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size); |
2070 | 726 | for (j = 0; j < to_size; j++500 ) |
2071 | 500 | { |
2072 | | // Check whether this already shows up in the backward graph; if that is the case, it won't be in the apply
2073 | | // gradients graph.
2074 | 500 | const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){ |
2075 | 500 | .d = tos[j], |
2076 | 500 | .graph = model->graph, |
2077 | 500 | }); |
2078 | 500 | if (!exec.graph) |
2079 | 313 | ccv_array_add_unique_int(apply_gradients_from, tos[j]); |
2080 | 500 | } |
2081 | 226 | } |
2082 | 21 | const int from_size = apply_gradients_from->rnum; |
2083 | 21 | if (from_size == 0) |
2084 | 0 | { |
2085 | 0 | ccv_array_free(apply_gradients_from); |
2086 | 0 | ccv_array_free(tensor_binds); |
2087 | 0 | return; |
2088 | 0 | } |
2089 | 21 | ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size); |
2090 | 154 | for (i = 0; i < from_size; i++133 ) |
2091 | 133 | froms[i] = (ccv_nnc_graph_exec_symbol_t){ |
2092 | 133 | .d = *(int*)ccv_array_get(apply_gradients_from, i), |
2093 | 133 | .graph = model->graph |
2094 | 133 | }; |
2095 | 21 | ccv_array_free(apply_gradients_from); |
2096 | | // It can only end with updates to the parameters.
2097 | 21 | ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0); |
2098 | 154 | for (i = 0; i < parameter_size; i++133 ) |
2099 | 133 | { |
2100 | 133 | if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
2101 | 0 | continue; |
2102 | 133 | ccv_array_push(tos, &compiled_data->update_nodes[i]); |
2103 | 313 | for (j = 1; j < parallel_count; j++180 ) |
2104 | 180 | { |
2105 | 180 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j); |
2106 | 180 | ccv_array_push(tos, ©); |
2107 | 180 | } |
2108 | 133 | } |
2109 | 21 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena); |
2110 | 21 | ccv_array_free(tos); |
2111 | 21 | ccv_array_free(tensor_binds); |
2112 | 21 | ccfree(froms); |
2113 | 21 | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
2114 | 213 | for (i = 0; i < max_saved_aux_size * parameter_size; i++192 ) |
2115 | 192 | { |
2116 | | // Skip if there is no tensor.
2117 | 192 | if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL) |
2118 | 0 | continue; |
2119 | 192 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source); |
2120 | 192 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0); |
2121 | 540 | for (j = 1; j < parallel_count; j++348 ) |
2122 | 348 | { |
2123 | 348 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j)); |
2124 | 348 | if (copy) |
2125 | 348 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, ©, 1, 0); |
2126 | 348 | } |
2127 | 192 | } |
2128 | 21 | ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count); |
2129 | 21 | } |
2130 | | |
2131 | | void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context) |
2132 | 7.81k | { |
2133 | 7.81k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2134 | 7.81k | assert(compiled_data); |
2135 | 7.81k | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
2136 | 7.81k | const int parallel_count = ccv_max(model->parallel_count, 1); |
2137 | 7.81k | assert(model->graph); |
2138 | 7.81k | assert(compiled_data->graph); |
2139 | | // Skip if there is no backward pass. |
2140 | 7.81k | if (compiled_data->backward.count <= 0) |
2141 | 1.65k | return; |
2142 | | // Skip if there are no parameters.
2143 | 6.16k | if (compiled_data->parameters->rnum == 0) |
2144 | 3 | { |
2145 | 3 | compiled_data->backward.count = 0; |
2146 | 3 | return; |
2147 | 3 | } |
2148 | 6.16k | if (!compiled_data->apply_gradients.graph) |
2149 | 21 | _ccv_cnnp_model_multistage_jit_2(model); |
2150 | 6.14k | else { |
2151 | 6.14k | const int parameter_size = compiled_data->parameters->rnum; |
2152 | 6.14k | ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena); |
2153 | | // Change to bind accum_gradients if we do gradient accumulation (run backward more than once). |
2154 | 6.14k | if (compiled_data->backward.count > 1) |
2155 | 497 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count); |
2156 | 5.64k | else |
2157 | 5.64k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count); |
2158 | 6.14k | } |
2159 | 6.16k | if (compiled_data->apply_gradients.graph) |
2160 | 6.16k | ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context); |
2161 | | // Reset backward count to 0. |
2162 | 6.16k | compiled_data->backward.count = 0; |
2163 | 6.16k | } |
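
Putting backward and apply_gradients together, gradient accumulation falls out of the backward.count bookkeeping above: every backward call past the first since the last apply_gradients runs the EWSUM accumulator, and apply_gradients then consumes accum_gradients. A sketch, assuming `model` was compiled with a minimizer and that x0, x1, y and dy are hypothetical tensors set up like the earlier sketches:

	// Sketch only: two micro-batches accumulated into one optimizer step.
	int i;
	for (i = 0; i < 2; i++)
	{
		ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
			.requires_grad = 1,
		}, TENSOR_LIST(i == 0 ? x0 : x1), TENSOR_LIST(y), 0, 0);
		// The second iteration accumulates into accum_gradients via the jitted accumulator graph.
		ccv_cnnp_model_backward(model, TENSOR_LIST(dy), 0, 0, 0, 0);
	}
	// One optimizer step over the accumulated gradients; this also resets backward.count to 0.
	ccv_cnnp_model_apply_gradients(model, 0);
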
2164 | | |
2165 | | void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor) |
2166 | 32 | { |
2167 | 32 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2168 | 32 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 18 : parameter->param_sel24 ; |
2169 | 32 | assert(parameter->param_sel != 0); |
2170 | 32 | const int tensors_init = !!compiled_data->tensors_init.v; |
2171 | 32 | if (!tensors_init) |
2172 | 17 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
2173 | 15 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
2174 | | // Check whether it is fully allocated; if it is not, run init_1.
2175 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
2176 | 32 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2177 | 32 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2178 | 32 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 131 : parameter->param_ref1 ; |
2179 | 32 | if (param_ref < 0) |
2180 | 1 | { assert(parameter_indices->rnum == 1); } |
2181 | 31 | else |
2182 | 31 | { assert(param_ref < parameter_indices->rnum); } |
2183 | 32 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2184 | 32 | ccv_array_free(parameter_indices); |
2185 | 32 | const int parameter_size = compiled_data->parameters->rnum; |
2186 | 32 | assert(d >= 0); |
2187 | 32 | assert(d < parameter_size); |
2188 | 32 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2189 | 32 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2190 | 32 | assert(dest); |
2191 | 32 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0); |
2192 | 32 | int i; |
2193 | 32 | for (i = 1; i < parallel_count; i++0 ) |
2194 | 0 | { |
2195 | 0 | ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]); |
2196 | 0 | if (copy_tensor) |
2197 | 0 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0); |
2198 | 0 | } |
2199 | | // Mark this symbol as init'ed. |
2200 | 32 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d; |
2201 | 32 | uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
2202 | 32 | init_v[s >> 5] |= (1u << (s & 0x1f)); |
2203 | 32 | } |
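
A sketch of setting a single parameter from host memory; the selector (-1, 0 picks the first parameter) and the 10x10 shape are hypothetical and must match the actual parameter of a compiled `model`.

	// Sketch only: overwrite the first parameter of a hypothetical compiled `model` with host data.
	ccv_nnc_tensor_t* const host_weight = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 10), 0);
	// ... fill host_weight->data.f32 here ...
	ccv_cnnp_model_set_parameter(model, ccv_cnnp_model_parameters(model, -1, 0), host_weight);
	ccv_nnc_tensor_free(host_weight);
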
2204 | | |
2205 | | void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor) |
2206 | 6 | { |
2207 | 6 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2208 | 6 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 13 : parameter->param_sel3 ; |
2209 | 6 | assert(parameter->param_sel != 0); |
2210 | 6 | assert(compiled_data->tensors.parameters); |
2211 | 6 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2212 | 6 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2213 | 6 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 13 : parameter->param_ref3 ; |
2214 | 6 | if (param_ref < 0) |
2215 | 3 | { assert(parameter_indices->rnum == 1); } |
2216 | 3 | else |
2217 | 3 | { assert(param_ref < parameter_indices->rnum); } |
2218 | 6 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2219 | 6 | ccv_array_free(parameter_indices); |
2220 | 6 | const int parameter_size = compiled_data->parameters->rnum; |
2221 | 6 | assert(d >= 0); |
2222 | 6 | assert(d < parameter_size); |
2223 | | // We don't need to consider parallel_count; every parameter on each device is identical.
2224 | 6 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2225 | 6 | assert(src); |
2226 | 6 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0); |
2227 | 6 | } |
2228 | | |
2229 | | ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter) |
2230 | 1 | { |
2231 | 1 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2232 | 1 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 10 : parameter->param_sel; |
2233 | 1 | assert(parameter->param_sel != 0); |
2234 | 1 | assert(compiled_data->tensors.parameters); |
2235 | 1 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2236 | 1 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2237 | 1 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 10 : parameter->param_ref; |
2238 | 1 | if (param_ref < 0) |
2239 | 1 | { assert(parameter_indices->rnum == 1); } |
2240 | 0 | else |
2241 | 0 | { assert(param_ref < parameter_indices->rnum); } |
2242 | 1 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2243 | 1 | ccv_array_free(parameter_indices); |
2244 | 1 | const int parameter_size = compiled_data->parameters->rnum; |
2245 | 1 | assert(d >= 0); |
2246 | 1 | assert(d < parameter_size); |
2247 | | // We don't need to consider parallel_count; every parameter on each device is identical.
2248 | 1 | ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2249 | 1 | assert(tensor); |
2250 | 1 | return tensor->info; |
2251 | 1 | } |
2252 | | |
2253 | | const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter) |
2254 | 2 | { |
2255 | 2 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2256 | 2 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel0 ; |
2257 | 2 | assert(parameter->param_sel != 0); |
2258 | 2 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2259 | 2 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2260 | 2 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref0 ; |
2261 | 2 | if (param_ref < 0) |
2262 | 0 | { assert(parameter_indices->rnum == 1); } |
2263 | 2 | else |
2264 | 2 | { assert(param_ref < parameter_indices->rnum); } |
2265 | 2 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2266 | 2 | ccv_array_free(parameter_indices); |
2267 | 2 | const int parameter_size = compiled_data->parameters->rnum; |
2268 | 2 | assert(d >= 0); |
2269 | 2 | assert(d < parameter_size); |
2270 | 2 | return *(char**)ccv_array_get(compiled_data->ids.parameters, d); |
2271 | 2 | } |
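
These accessors pair naturally: query the parameter's tensor parameters to size a destination, copy the values out, and use the id for logging. A sketch, where `model` is a hypothetical compiled model whose parameters are already initialized:

	// Sketch only: snapshot the first parameter of a hypothetical compiled `model`.
	const ccv_cnnp_model_io_t weight = ccv_cnnp_model_parameters(model, -1, 0);
	const ccv_nnc_tensor_param_t weight_params = ccv_cnnp_model_parameter_tensor_params(model, weight);
	ccv_nnc_tensor_t* const snapshot = ccv_nnc_tensor_new(0, weight_params, 0);
	ccv_cnnp_model_parameter_copy(model, weight, snapshot);
	const char* const id = ccv_cnnp_model_parameter_name(model, weight); // Human-readable id, useful for logging.
	ccv_nnc_tensor_free(snapshot);
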
2272 | | |
2273 | | int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model) |
2274 | 0 | { |
2275 | 0 | assert(model->compiled_data); |
2276 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2277 | 0 | return compiled_data->parameters->rnum; |
2278 | 0 | } |
2279 | | |
2280 | | ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context) |
2281 | 0 | { |
2282 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2283 | 0 | assert(compiled_data); |
2284 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2285 | 0 | int i; |
2286 | 0 | for (i = 0; i < parameter_size; i++) |
2287 | 0 | { |
2288 | 0 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i); |
2289 | 0 | if (first(model, name, context)) |
2290 | 0 | return ccv_cnnp_model_parameters(model, -1, i); |
2291 | 0 | } |
2292 | 0 | return 0; |
2293 | 0 | } |
2294 | | |
2295 | | ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context) |
2296 | 0 | { |
2297 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2298 | 0 | assert(compiled_data); |
2299 | 0 | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0); |
2300 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2301 | 0 | int i; |
2302 | 0 | for (i = 0; i < parameter_size; i++) |
2303 | 0 | { |
2304 | 0 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i); |
2305 | 0 | if (filter(model, name, context)) |
2306 | 0 | { |
2307 | 0 | ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i); |
2308 | 0 | ccv_array_push(parameters, ¶meter); |
2309 | 0 | } |
2310 | 0 | } |
2311 | 0 | return parameters; |
2312 | |
2313 | 0 | } |
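
Both ccv_cnnp_model_parameter_first and ccv_cnnp_model_parameters_filter take a predicate over parameter ids. A sketch of a name-based filter follows; it assumes the ccv_cnnp_model_parameters_filter_f prototype matches how it is invoked above, (model, name, context), and that <string.h> is available.

	// Hypothetical predicate: match parameters whose id contains a given substring.
	static int _select_by_substring(const ccv_cnnp_model_t* const model, const char* const name, void* const context)
	{
		const char* const needle = (const char*)context;
		return name != 0 && strstr(name, needle) != 0;
	}

	// Usage (inside some function): collect all parameters whose id mentions "bias" (the substring is purely illustrative).
	ccv_array_t* const biases = ccv_cnnp_model_parameters_filter(model, _select_by_substring, (void*)"bias");
	// ... use the ccv_cnnp_model_io_t handles stored in `biases` ...
	ccv_array_free(biases);
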
2314 | | |
2315 | | CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model) |
2316 | 0 | { |
2317 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2318 | 0 | assert(compiled_data); |
2319 | 0 | const int tensors_init = !!compiled_data->tensors_init.v; |
2320 | 0 | if (!tensors_init) // If nothing initialized, we return parameter 0. |
2321 | 0 | return ccv_cnnp_model_parameters(model, -1, 0); |
2322 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2323 | 0 | int i; |
2324 | 0 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
2325 | 0 | for (i = 0; i < parameter_size; i++) |
2326 | 0 | { |
2327 | 0 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
2328 | 0 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
2329 | 0 | return ccv_cnnp_model_parameters(model, -1, i); |
2330 | 0 | } |
2331 | 0 | return 0; |
2332 | 0 | } |
2333 | | |
2334 | | static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref) |
2335 | 48 | { |
2336 | 48 | const int to_param_sel = parameters->param_sel > 0 ? parameters->param_sel - 10 : parameters->param_sel; |
2337 | 48 | assert(parameters->param_sel != 0); |
2338 | 48 | ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2339 | 48 | ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices); |
2340 | 48 | *param_ref = parameters->param_ref > 0 ? parameters->param_ref - 10 : parameters->param_ref; |
2341 | 48 | return to_parameter_indices; |
2342 | 48 | } |
2343 | | |
2344 | | static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0) |
2345 | 14 | { |
2346 | | // If the model is not compiled yet. Compile them now. |
2347 | 14 | if (!model->graph) |
2348 | 3 | { |
2349 | 3 | model->graph = ccv_nnc_symbolic_graph_new(); |
2350 | 3 | assert(from_model->compiled_data); |
2351 | 3 | const int input_size = from_model->input_size; |
2352 | 3 | ccv_nnc_tensor_param_t input_params[input_size]; |
2353 | 3 | int i; |
2354 | 9 | for (i = 0; i < input_size; i++6 ) |
2355 | 6 | input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]); |
2356 | 3 | _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss); |
2357 | 3 | model->parallel_count = from_model->parallel_count; |
2358 | 3 | model->memory_compression = from_model->memory_compression; |
2359 | 3 | model->memory_reduction = from_model->memory_reduction; |
2360 | 3 | model->gradient_checkpointing = from_model->gradient_checkpointing; |
2361 | 3 | model->compiled_data->stream_type = from_model->compiled_data->stream_type; |
2362 | 3 | model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer; |
2363 | 3 | model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size; |
2364 | 3 | } |
2365 | 14 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2366 | 14 | assert(to_compiled_data); |
2367 | 14 | const int to_tensors_init = !!to_compiled_data->tensors_init.v; |
2368 | 14 | if (!to_tensors_init) |
2369 | 10 | { |
2370 | 10 | if (only_init_0) |
2371 | 1 | ccv_cnnp_model_tensors_init_0(model, to_compiled_data); |
2372 | 9 | else |
2373 | 9 | _ccv_cnnp_model_tensors_init(model, to_compiled_data); |
2374 | 10 | } else if (4 !only_init_04 && (uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13 ) |
2375 | | // Check if it is not fully allocated, if it is not, init_1. |
2376 | 0 | ccv_cnnp_model_tensors_init_1(model, to_compiled_data); |
2377 | 14 | assert(to_compiled_data->tensors.parameters); |
2378 | 14 | *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref); |
2379 | 14 | *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref); |
2380 | 14 | if (*from_param_ref < 0 && *param_ref >= 0) |
2381 | 0 | { assert((*from_parameter_indices)->rnum == 1); } |
2382 | 14 | else if (*from_param_ref >= 0) |
2383 | 0 | { assert(*from_param_ref < (*from_parameter_indices)->rnum); } |
2384 | 14 | if (*param_ref < 0 && *from_param_ref >= 0) |
2385 | 0 | { assert((*parameter_indices)->rnum == 1); } |
2386 | 14 | else if (*param_ref >= 0) |
2387 | 0 | { assert(*param_ref < (*parameter_indices)->rnum); } |
2388 | 14 | } |
2389 | | |
2390 | | void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters) |
2391 | 9 | { |
2392 | 9 | ccv_array_t* to_parameter_indices; |
2393 | 9 | int to_param_ref; |
2394 | 9 | ccv_array_t* from_parameter_indices; |
2395 | 9 | int from_param_ref; |
2396 | 9 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0); |
2397 | | // Should be exactly the same tensor. |
2398 | 9 | if (to_param_ref < 0 && from_param_ref < 0) |
2399 | 9 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2400 | | // To models. |
2401 | 9 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2402 | 9 | assert(to_compiled_data); |
2403 | | // From models. |
2404 | 9 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2405 | 9 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2406 | 9 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2407 | 9 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 10 ; |
2408 | 9 | int i, j; |
2409 | 9 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2410 | 9 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2411 | 18 | for (i = 0; i < rnum; i++9 ) |
2412 | 9 | { |
2413 | 9 | const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i); |
2414 | 9 | assert(src_d >= 0); |
2415 | 9 | assert(src_d < from_compiled_data->parameters->rnum); |
2416 | 9 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2417 | | // If the original is not init'ed. We cannot copy from. |
2418 | 9 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2419 | 0 | continue; |
2420 | 9 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2421 | 9 | assert(dest_d >= 0); |
2422 | 9 | assert(dest_d < to_compiled_data->parameters->rnum); |
2423 | 9 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]); |
2424 | 9 | assert(src); |
2425 | 9 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2426 | 9 | assert(dest); |
2427 | 9 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0); |
2428 | 27 | for (j = 1; j < parallel_count; j++18 ) |
2429 | 18 | { |
2430 | 18 | ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2431 | 18 | if (copy_tensor) |
2432 | 18 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0); |
2433 | 18 | } |
2434 | | // Mark this symbol as init'ed. |
2435 | 9 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2436 | 9 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2437 | 9 | } |
2438 | 9 | ccv_array_free(to_parameter_indices); |
2439 | 9 | ccv_array_free(from_parameter_indices); |
2440 | 9 | } |
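
A one-line sketch of copying every parameter across models, e.g. to maintain a separate weight copy; `ema` and `model` are hypothetical compiled models with matching parameter sets, and the -1/-1 selector is assumed to mean "all parameters", matching how this file interprets param_sel and param_ref internally.

	// Sketch only: copy all parameters from `model` into `ema`.
	ccv_cnnp_model_set_parameters(ema, ccv_cnnp_model_parameters(ema, -1, -1), model, ccv_cnnp_model_parameters(model, -1, -1));
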
2441 | | |
2442 | | KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int) |
2443 | | |
2444 | | void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context) |
2445 | 2 | { |
2446 | 2 | ccv_array_t* to_parameter_indices; |
2447 | 2 | int to_param_ref; |
2448 | 2 | ccv_array_t* from_parameter_indices; |
2449 | 2 | int from_param_ref; |
2450 | 2 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1); |
2451 | | // Should be exactly the same tensor. |
2452 | 2 | if (renamer == 0 && to_param_ref < 01 && from_param_ref < 01 ) |
2453 | 1 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2454 | | // To models. |
2455 | 2 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2456 | 2 | assert(to_compiled_data); |
2457 | | // From models. |
2458 | 2 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2459 | 2 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2460 | 2 | assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Should have the same parallel count can share parameters. |
2461 | 2 | const int from_parameter_size = from_compiled_data->parameters->rnum; |
2462 | 2 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2463 | 2 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2464 | 2 | int i, j; |
2465 | 2 | khash_t(ccv_cnnp_parameter_id)* id_map = 0; |
2466 | 2 | char* updated_name = 0; |
2467 | 2 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2468 | 2 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2469 | 8 | for (i = 0; i < rnum; i++6 ) |
2470 | 6 | { |
2471 | 6 | int src_d = (from_param_ref >= 0 ? from_param_ref0 : i) < from_parameter_indices->rnum ? *(int*)4 ccv_array_get4 (from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) : from_parameter_size2 ; |
2472 | | // Need to figure out how to use the renamer here. |
2473 | 6 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2474 | 6 | assert(dest_d >= 0); |
2475 | 6 | assert(dest_d < to_parameter_size); |
2476 | 6 | if (renamer) |
2477 | 3 | { |
2478 | 3 | const char* const src_name = (src_d < from_parameter_size && src_d >= 01 ) ? *(char**)1 ccv_array_get1 (from_compiled_data->ids.parameters, src_d) : 02 ; |
2479 | 3 | const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d); |
2480 | 3 | if (!updated_name) |
2481 | 1 | updated_name = (char*)ccmalloc(1024); |
2482 | 3 | const size_t src_name_len = src_name == 0 ? 02 : ccv_min1 (strnlen(src_name, 1023), 1023); |
2483 | 3 | if (src_name_len > 0) |
2484 | 1 | memcpy(updated_name, src_name, src_name_len); |
2485 | 3 | updated_name[src_name_len] = 0; |
2486 | 3 | if (renamer(context, dest_name, updated_name, 1024) != 0) |
2487 | 0 | continue; // Skip this. |
2488 | 3 | if (src_name != 0 && memcmp(updated_name, src_name, src_name_len) == 01 && strnlen(updated_name, 1023) == src_name_len0 ) |
2489 | 0 | { |
2490 | | // Nothing changed. |
2491 | 3 | } else { |
2492 | 3 | if (!id_map) |
2493 | 1 | { |
2494 | 1 | id_map = kh_init(ccv_cnnp_parameter_id); |
2495 | 2 | for (j = 0; j < from_parameter_size; j++1 ) |
2496 | 1 | { |
2497 | 1 | int ret; |
2498 | 1 | const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret); |
2499 | 1 | assert(ret != 0); |
2500 | 1 | kh_val(id_map, k) = j; |
2501 | 1 | } |
2502 | 1 | } |
2503 | 3 | const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name); |
2504 | 3 | if (k == kh_end(id_map)) // Cannot find the name, skip. |
2505 | 2 | continue; |
2506 | 1 | src_d = kh_val(id_map, k); |
2507 | 1 | assert(src_d >= 0); |
2508 | 1 | assert(src_d < from_parameter_size); |
2509 | 1 | } |
2510 | 3 | } |
2511 | 6 | assert(src_d >= 0)4 ; |
2512 | 4 | assert(src_d < from_parameter_size); |
2513 | 4 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2514 | | // If the original is not init'ed. We cannot share from. |
2515 | 4 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2516 | 0 | continue; |
2517 | 8 | for (j = 0; 4 j < parallel_count; j++4 ) |
2518 | 4 | { |
2519 | 4 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]); |
2520 | 4 | assert(src); |
2521 | 4 | ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]; |
2522 | 4 | if (dest && !((uintptr_t)dest & (uintptr_t)1)1 ) |
2523 | 1 | ccv_nnc_tensor_free(dest); |
2524 | 4 | to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1); |
2525 | 4 | } |
2526 | | // Mark this symbol as init'ed. |
2527 | 4 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2528 | 4 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2529 | 4 | } |
2530 | 2 | ccv_array_free(to_parameter_indices); |
2531 | 2 | ccv_array_free(from_parameter_indices); |
2532 | 2 | if (id_map) |
2533 | 1 | kh_destroy(ccv_cnnp_parameter_id, id_map); |
2534 | 2 | if (updated_name) |
2535 | 1 | ccfree(updated_name); |
2536 | | // Mark it as incomplete so we will call init_1. |
2537 | 2 | if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data)) |
2538 | 0 | to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1); |
2539 | 2 | else // Remove the flag. |
2540 | 2 | to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2541 | 2 | } |
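
The renamer hook gives the caller a chance to redirect each lookup: per the invocation above, it receives the destination parameter's id plus a buffer (of the given size) pre-filled with the best-guess source id, and it may rewrite the buffer or return non-zero to skip the parameter. A sketch under a purely hypothetical naming convention, with `student`/`teacher` standing in for two compiled models and <stdio.h> assumed for snprintf; the exact typedef qualifiers are assumed to match this invocation.

	// Hypothetical renamer: look each destination id up in the source model under a "teacher-" prefix.
	static int _prefix_with_teacher(void* const context, const char* const dest_name, char* const updated_name, const size_t provided_size)
	{
		snprintf(updated_name, provided_size, "teacher-%s", dest_name);
		return 0; // Return non-zero instead to skip this parameter.
	}

	// Usage (inside some function): share every parameter of `student` from `teacher` under the rewritten ids.
	ccv_cnnp_model_share_parameters(student, ccv_cnnp_model_parameters(student, -1, -1), teacher, ccv_cnnp_model_parameters(teacher, -1, -1), _prefix_with_teacher, 0);
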
2542 | | |
2543 | | ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type) |
2544 | 24 | { |
2545 | 24 | if (!compiled_data->stream_map) |
2546 | 4 | compiled_data->stream_map = kh_init(stream_map); |
2547 | 24 | int ret = 0; |
2548 | 24 | khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret); |
2549 | 24 | assert(ret >= 0); |
2550 | 24 | ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k); |
2551 | | // If ret == 0, the key already exist, we can return directly, otherwise, create and return. |
2552 | 24 | if (ret != 0) |
2553 | 16 | { |
2554 | 16 | stream = ccv_nnc_stream_context_new(type); |
2555 | 16 | kh_val(compiled_data->stream_map, k) = stream; |
2556 | 16 | } |
2557 | 24 | return stream; |
2558 | 24 | } |
2559 | | |
2560 | | void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters) |
2561 | 3 | { |
2562 | 3 | ccv_array_t* to_parameter_indices; |
2563 | 3 | int to_param_ref; |
2564 | 3 | ccv_array_t* from_parameter_indices; |
2565 | 3 | int from_param_ref; |
2566 | 3 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0); |
2567 | | // Should be exactly the same tensor. |
2568 | 3 | if (to_param_ref < 0 && from_param_ref < 0) |
2569 | 3 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2570 | | // To models. |
2571 | 3 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2572 | 3 | assert(to_compiled_data); |
2573 | | // From models. |
2574 | 3 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2575 | 3 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2576 | 3 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2577 | 3 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 10 ; |
2578 | 3 | assert(aux_in_size >= 0); |
2579 | 3 | assert(aux_out_size >= 0); |
2580 | 3 | int i, j; |
2581 | 3 | ccv_nnc_tensor_t* inputs[aux_in_size + 2]; |
2582 | 3 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2583 | 3 | for (i = 0; i < aux_in_size; i++0 ) |
2584 | 0 | inputs[i + 2] = aux_ins[i]; |
2585 | 3 | for (i = 0; i < aux_out_size; i++0 ) |
2586 | 0 | outputs[i + 1] = aux_outs[i]; |
2587 | 3 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2588 | 3 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2589 | 6 | for (i = 0; i < rnum; i++3 ) |
2590 | 3 | { |
2591 | 3 | const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i); |
2592 | 3 | assert(src_d >= 0); |
2593 | 3 | assert(src_d < from_compiled_data->parameters->rnum); |
2594 | 3 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2595 | | // If the original is not init'ed. We cannot copy from. |
2596 | 3 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2597 | 0 | continue; |
2598 | 3 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2599 | 3 | assert(dest_d >= 0); |
2600 | 3 | assert(dest_d < to_compiled_data->parameters->rnum); |
2601 | 3 | if (parallel_count > 1) |
2602 | 2 | { |
2603 | 2 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2604 | 2 | ccv_nnc_stream_signal_t* signal; |
2605 | 2 | if (stream_context) |
2606 | 1 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2607 | 10 | for (j = 0; j < parallel_count; j++8 ) |
2608 | 8 | { |
2609 | 8 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]); |
2610 | 8 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2611 | 8 | if (!dest || !src) |
2612 | 0 | { |
2613 | 0 | streams[j] = 0; |
2614 | 0 | continue; |
2615 | 0 | } |
2616 | | // At the moment, can only handle them on the same device. |
2617 | 8 | assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type)); |
2618 | 8 | assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type)); |
2619 | 8 | const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU0 ; |
2620 | 8 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type); |
2621 | 8 | int type = stream_type; |
2622 | 8 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
2623 | 8 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
2624 | | // Wait signal to finish. |
2625 | 8 | if (stream_context) |
2626 | 4 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
2627 | 8 | inputs[0] = outputs[0] = dest; |
2628 | 8 | inputs[1] = src; |
2629 | 8 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0); |
2630 | 8 | if (stream_context) |
2631 | 4 | { |
2632 | 4 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
2633 | 4 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
2634 | 4 | } |
2635 | 8 | streams[j] = stream_0; |
2636 | 8 | } |
2637 | | // If this should be blocking, blocking it. |
2638 | 2 | if (!stream_context) |
2639 | 5 | for (j = 0; 1 j < parallel_count; j++4 ) |
2640 | 4 | if (streams[j]) |
2641 | 4 | ccv_nnc_stream_context_wait(streams[j]); |
2642 | 2 | } else { |
2643 | 1 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]); |
2644 | 1 | assert(src); |
2645 | 1 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2646 | 1 | assert(dest); |
2647 | 1 | inputs[0] = outputs[0] = dest; |
2648 | 1 | inputs[1] = src; |
2649 | 1 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context); |
2650 | 1 | } |
2651 | | // Mark this symbol as init'ed. |
2652 | 3 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2653 | 3 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2654 | 3 | } |
2655 | 3 | ccv_array_free(to_parameter_indices); |
2656 | 3 | ccv_array_free(from_parameter_indices); |
2657 | 3 | } |
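
zip_map runs cmd with (destination, source, aux...) as inputs and the destination as output, so it fits in-place blends such as an exponential moving average of parameters. A sketch, where `ema` and `model` are hypothetical compiled models with identical parameter sets, the 0.9/0.1 coefficients are illustrative, and CMD_ADD_FORWARD is assumed to take the two blend coefficients:

	// Sketch only: ema_param = 0.9 * ema_param + 0.1 * model_param for every parameter.
	ccv_cnnp_model_parameters_zip_map(ema, ccv_cnnp_model_parameters(ema, -1, -1), CMD_ADD_FORWARD(0.9, 0.1), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0, model, ccv_cnnp_model_parameters(model, -1, -1));
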
2658 | | |
2659 | | void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context) |
2660 | 14 | { |
2661 | 14 | int to_param_ref; |
2662 | 14 | ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref); |
2663 | | // The destination model. |
2664 | 14 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2665 | 14 | assert(to_compiled_data); |
2666 | | // Tensors have to be init'ed already. |
2667 | 14 | assert(!!to_compiled_data->tensors_init.v); |
2668 | 14 | assert(to_compiled_data->tensors.parameters); |
2669 | | // Parameter counts and parallel setup. |
2670 | 14 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2671 | 14 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2672 | 14 | const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2673 | 14 | assert(aux_in_size >= 0); |
2674 | 14 | assert(aux_out_size >= 0); |
2675 | 14 | int i, j; |
2676 | 14 | ccv_nnc_tensor_t* inputs[aux_in_size + 1]; |
2677 | 14 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2678 | 14 | for (i = 0; i < aux_in_size; i++0 ) |
2679 | 0 | inputs[i + 1] = aux_ins[i]; |
2680 | 14 | for (i = 0; i < aux_out_size; i++0 ) |
2681 | 0 | outputs[i + 1] = aux_outs[i]; |
2682 | 28 | for (i = 0; i < rnum; i++14 ) |
2683 | 14 | { |
2684 | 14 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2685 | 14 | assert(dest_d >= 0); |
2686 | 14 | assert(dest_d < to_compiled_data->parameters->rnum); |
2687 | 14 | if (parallel_count > 1) |
2688 | 4 | { |
2689 | 4 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2690 | 4 | ccv_nnc_stream_signal_t* signal; |
2691 | 4 | if (stream_context) |
2692 | 1 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2693 | 20 | for (j = 0; j < parallel_count; j++16 ) |
2694 | 16 | { |
2695 | 16 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2696 | 16 | if (!dest) |
2697 | 0 | { |
2698 | 0 | streams[j] = 0; |
2699 | 0 | continue; |
2700 | 0 | } |
2701 | 16 | const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU0 ; |
2702 | 16 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type); |
2703 | 16 | int type = stream_type; |
2704 | 16 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
2705 | 16 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
2706 | | // Wait on the signal so prior work on the caller's stream has finished. |
2707 | 16 | if (stream_context) |
2708 | 4 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
2709 | 16 | inputs[0] = outputs[0] = dest; |
2710 | 16 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0); |
2711 | 16 | if (stream_context) |
2712 | 4 | { |
2713 | 4 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
2714 | 4 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
2715 | 4 | } |
2716 | 16 | streams[j] = stream_0; |
2717 | 16 | } |
2718 | | // If this should be blocking (no stream context given), block on the streams. |
2719 | 4 | if (!stream_context) |
2720 | 15 | for (j = 0; 3 j < parallel_count; j++12 ) |
2721 | 12 | if (streams[j]) |
2722 | 12 | ccv_nnc_stream_context_wait(streams[j]); |
2723 | 10 | } else { |
2724 | 10 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2725 | 10 | assert(dest); |
2726 | 10 | inputs[0] = outputs[0] = dest; |
2727 | 10 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context); |
2728 | 10 | } |
2729 | | // No need to mark this symbol as init'ed; it already is. |
2730 | 14 | } |
2731 | 14 | ccv_array_free(to_parameter_indices); |
2732 | 14 | } |
2733 | | |
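ccv_cnnp_model_parameters_map applies an arbitrary nnc command to the selected parameter tensors in place: each parameter becomes inputs[0] and outputs[0], with the aux tensors appended after it. A minimal sketch that zero-fills every parameter of an already-compiled, already-initialized model; ALL_PARAMETERS and CMD_SET_FORWARD are assumed to come from the public headers.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Zero every parameter tensor in place; no aux tensors, synchronous (no stream context).
static void zero_all_parameters(ccv_cnnp_model_t* const model)
{
	const ccv_cnnp_model_io_t all = ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS);
	ccv_cnnp_model_parameters_map(model, all, CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
}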
2734 | | void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context) |
2735 | 6 | { |
2736 | 6 | int to_param_ref; |
2737 | 6 | ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref); |
2738 | | // The destination model. |
2739 | 6 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2740 | 6 | assert(to_compiled_data); |
2741 | | // Tensors have to be init'ed already. |
2742 | 6 | assert(!!to_compiled_data->tensors_init.v); |
2743 | 6 | ccv_nnc_tensor_t** tensor_gradients; |
2744 | 6 | if (to_compiled_data->backward.count > 1) |
2745 | 3 | tensor_gradients = to_compiled_data->tensors.accum_gradients; |
2746 | 3 | else |
2747 | 3 | tensor_gradients = to_compiled_data->tensors.gradients; |
2748 | 6 | assert(tensor_gradients); |
2749 | | // Parameter counts and parallel setup. |
2750 | 6 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2751 | 6 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2752 | 6 | const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2753 | 6 | assert(aux_in_size >= 0); |
2754 | 6 | assert(aux_out_size >= 0); |
2755 | 6 | int i, j; |
2756 | 6 | ccv_nnc_tensor_t* inputs[aux_in_size + 1]; |
2757 | 6 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2758 | 10 | for (i = 0; i < aux_in_size; i++4 ) |
2759 | 4 | inputs[i + 1] = aux_ins[i]; |
2760 | 14 | for (i = 0; i < aux_out_size; i++8 ) |
2761 | 8 | outputs[i + 1] = aux_outs[i]; |
2762 | 12 | for (i = 0; i < rnum; i++6 ) |
2763 | 6 | { |
2764 | 6 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2765 | 6 | assert(dest_d >= 0); |
2766 | 6 | assert(dest_d < to_compiled_data->parameters->rnum); |
2767 | 6 | if (parallel_count > 1) |
2768 | 0 | { |
2769 | 0 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2770 | 0 | ccv_nnc_stream_signal_t* signal; |
2771 | 0 | if (stream_context) |
2772 | 0 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2773 | 0 | for (j = 0; j < parallel_count; j++) |
2774 | 0 | { |
2775 | 0 | ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size]; |
2776 | 0 | if (!dest) |
2777 | 0 | { |
2778 | 0 | streams[j] = 0; |
2779 | 0 | continue; |
2780 | 0 | } |
2781 | 0 | const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU; |
2782 | 0 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type); |
2783 | 0 | int type = stream_type; |
2784 | 0 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
2785 | 0 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
2786 | | // Wait on the signal so prior work on the caller's stream has finished. |
2787 | 0 | if (stream_context) |
2788 | 0 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
2789 | 0 | inputs[0] = outputs[0] = dest; |
2790 | 0 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0); |
2791 | 0 | if (stream_context) |
2792 | 0 | { |
2793 | 0 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
2794 | 0 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
2795 | 0 | } |
2796 | 0 | streams[j] = stream_0; |
2797 | 0 | } |
2798 | | // If this should be blocking (no stream context given), block on the streams. |
2799 | 0 | if (!stream_context) |
2800 | 0 | for (j = 0; j < parallel_count; j++) |
2801 | 0 | if (streams[j]) |
2802 | 0 | ccv_nnc_stream_context_wait(streams[j]); |
2803 | 6 | } else { |
2804 | 6 | ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d]; |
2805 | 6 | if (!dest) |
2806 | 0 | continue; |
2807 | 6 | assert(dest); |
2808 | 6 | inputs[0] = outputs[0] = dest; |
2809 | 6 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context); |
2810 | 6 | } |
2811 | | // No need to mark this symbol as init'ed; it already is. |
2812 | 6 | } |
2813 | 6 | ccv_array_free(to_parameter_indices); |
2814 | 6 | } |
2815 | | |
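ccv_cnnp_model_parameter_gradients_map is the same in-place mapping applied to the gradient (or accumulated-gradient) tensors instead of the parameters, which is useful for scaling or clipping gradients between backward and apply-gradients. A hedged sketch that scales every gradient in place; CMD_SCALAR_MUL_FORWARD and ALL_PARAMETERS are assumed to come from the public headers.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Multiply every gradient tensor by `scale` in place, synchronously.
static void scale_all_gradients(ccv_cnnp_model_t* const model, const float scale)
{
	const ccv_cnnp_model_io_t all = ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS);
	ccv_cnnp_model_parameter_gradients_map(model, all, CMD_SCALAR_MUL_FORWARD(scale), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
}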
2816 | | ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model) |
2817 | 2.20k | { |
2818 | 2.20k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2819 | 2.20k | assert(compiled_data); |
2820 | 2.20k | return compiled_data->minimize.minimizer; |
2821 | 2.20k | } |
2822 | | |
2823 | | void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size) |
2824 | 4.36k | { |
2825 | 4.36k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2826 | 4.36k | assert(compiled_data); |
2827 | 4.36k | const int parameter_size = compiled_data->parameters->rnum; |
2828 | 4.36k | if (parameter_size == 0) |
2829 | 3 | return; |
2830 | 4.35k | if (reset) |
2831 | 2.49k | { assert(set_parameters == 0 && set_parameter_size == 0); } |
2832 | 4.35k | const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
2833 | 4.35k | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer); |
2834 | 4.35k | if (saved_aux_size > compiled_data->minimize.max_saved_aux_size) |
2835 | 7 | compiled_data->minimize.max_saved_aux_size = saved_aux_size; |
2836 | 4.35k | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
2837 | | // We update all parameters; at this point, we have a single minimizer. |
2838 | 4.35k | if (set_parameters == 0 || set_parameter_size == 0301 ) |
2839 | 4.05k | compiled_data->minimize.minimizer = minimizer; |
2840 | 4.35k | int i; |
2841 | 4.35k | if (set_parameters && set_parameter_size301 ) |
2842 | 301 | { |
2843 | | // We need to record which minimizer goes with these parameters. |
2844 | 301 | if (!compiled_data->minimize.parameters) |
2845 | 5 | compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0); |
2846 | 301 | ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t)); |
2847 | 301 | set_minimizer_for_parameter->minimizer = minimizer; |
2848 | 301 | set_minimizer_for_parameter->parameter_size = set_parameter_size; |
2849 | 301 | memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size); |
2850 | 301 | ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter); |
2851 | 301 | } |
2852 | | // If reset is true, clear the parameters array. |
2853 | 4.35k | if (reset && compiled_data->minimize.parameters2.49k ) |
2854 | 291 | { |
2855 | 582 | for (i = 0; i < compiled_data->minimize.parameters->rnum; i++291 ) |
2856 | 291 | ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i)); |
2857 | 291 | ccv_array_clear(compiled_data->minimize.parameters); |
2858 | 291 | } |
2859 | 4.35k | if (!compiled_data->update_nodes) |
2860 | 9 | return; |
2861 | 4.34k | ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph; |
2862 | 4.34k | assert(symbolic_graph); |
2863 | 4.34k | if (saved_aux_size > old_max_saved_aux_size) |
2864 | 7 | { |
2865 | 7 | assert(compiled_data->updated_parameters); |
2866 | | // Reallocate first, move them around later. |
2867 | 7 | compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size); |
2868 | 7 | compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size); |
2869 | 7 | compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size); |
2870 | | // We need to do this from back to front because saved_aux_size > old_max_saved_aux_size, so the regions could overlap. |
2871 | 7 | _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size); |
2872 | 7 | } |
2873 | 4.34k | int flag = 0; |
2874 | 4.34k | const int parallel_count = ccv_max(model->parallel_count, 1); |
2875 | 4.34k | if (set_parameters && set_parameter_size296 ) |
2876 | 296 | { |
2877 | 296 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2878 | 592 | for (i = 0; i < set_parameter_size; i++296 ) |
2879 | 296 | { |
2880 | 296 | const int param_sel = set_parameters[i]->param_sel > 0 ? set_parameters[i]->param_sel - 1291 : set_parameters[i]->param_sel5 ; |
2881 | 296 | assert(set_parameters[i]->param_sel != 0); |
2882 | 296 | const int old_rnum = parameter_indices->rnum; |
2883 | 296 | ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices); |
2884 | 296 | const int param_ref = set_parameters[i]->param_ref > 0 ? set_parameters[i]->param_ref - 10 : set_parameters[i]->param_ref; |
2885 | 296 | assert(set_parameters[i]->param_ref != 0); |
2886 | 296 | if (param_ref >= 0) |
2887 | 0 | { |
2888 | 0 | assert(param_ref + old_rnum < parameter_indices->rnum); |
2889 | 0 | *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum); |
2890 | 0 | parameter_indices->rnum = old_rnum + 1; |
2891 | 0 | } |
2892 | 296 | } |
2893 | | // We may have duplicate indices, but that is OK; we will simply set them twice. |
2894 | 5.24k | for (i = 0; 296 i < parameter_indices->rnum; i++4.95k ) |
2895 | 4.95k | { |
2896 | 4.95k | const int d = *(int*)ccv_array_get(parameter_indices, i); |
2897 | 4.95k | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d)) |
2898 | 0 | flag = 1; |
2899 | 4.95k | } |
2900 | 296 | ccv_array_free(parameter_indices); |
2901 | 4.05k | } else { |
2902 | 19.1k | for (i = 0; i < parameter_size; i++15.0k ) |
2903 | 15.0k | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i)) |
2904 | 65 | flag = 1; |
2905 | 4.05k | if (compiled_data->minimize.parameters) |
2906 | 291 | if (_ccv_cnnp_apply_parameters_with_minimizer(model)) |
2907 | 0 | flag = 1; |
2908 | 4.05k | } |
2909 | 4.34k | if (flag) |
2910 | 7 | { |
2911 | | // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply-gradients graph. |
2912 | 7 | if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE) |
2913 | 0 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
2914 | 7 | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
2915 | 7 | } |
2916 | 4.34k | } |
2917 | | |
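ccv_cnnp_model_set_minimizer replaces the optimizer command for all (or selected) parameters, growing the saved auxiliary slots when the new minimizer needs more of them and freeing the apply-gradients graph so it can be rebuilt. A hedged sketch of switching a compiled model to SGD; the CMD_SGD_FORWARD argument order (nesterov, rate, scale, decay, momentum, dampening) is an assumption here, not taken from this file.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

// Apply one minimizer to every parameter: reset = 0, no per-parameter overrides.
static void use_sgd(ccv_cnnp_model_t* const model)
{
	const ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, 0.001, 1, 0.99, 0.9, 0.9);
	ccv_cnnp_model_set_minimizer(model, sgd, 0, 0, 0);
}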
2918 | | void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params) |
2919 | 0 | { |
2920 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2921 | 0 | assert(compiled_data); |
2922 | 0 | compiled_data->compile_params = compile_params; |
2923 | 0 | } |
2924 | | |
2925 | | void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size) |
2926 | 45 | { |
2927 | 45 | if (model->graph && out_size > 044 ) |
2928 | 44 | ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]); |
2929 | 45 | if (model->compiled_data && model->compiled_data->graph44 && out_size > 116 ) |
2930 | 0 | ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]); |
2931 | 45 | if (model->compiled_data && model->compiled_data->backward.accum44 && out_size > 20 ) |
2932 | 0 | ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]); |
2933 | 45 | if (model->compiled_data && model->compiled_data->apply_gradients.graph44 && out_size > 33 ) |
2934 | 0 | ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]); |
2935 | 45 | } |
2936 | | |
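ccv_cnnp_model_dot writes up to four Graphviz dumps, in order: the symbolic graph, the compiled graph, the gradient-accumulation graph, and the apply-gradients graph, depending on how many FILE* slots are passed. A small sketch that dumps just the symbolic graph; the output path and flag choice are illustrative.

#include <stdio.h>
#include "ccv_nnc.h"

// Dump the symbolic graph in long form to model.dot.
static void dump_model_dot(const ccv_cnnp_model_t* const model)
{
	FILE* out = fopen("model.dot", "w");
	if (!out)
		return;
	ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, &out, 1);
	fclose(out);
}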
2937 | | void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context) |
2938 | 0 | { |
2939 | 0 | if (model->graph) |
2940 | 0 | ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context); |
2941 | 0 | } |
2942 | | |
2943 | | static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
2944 | 2.29k | { |
2945 | 2.29k | int i; |
2946 | 2.29k | const int parameter_size = compiled_data->parameters->rnum; |
2947 | 2.29k | ccv_array_free(compiled_data->parameters); |
2948 | 2.29k | if (compiled_data->parameter_flags) |
2949 | 8 | ccfree(compiled_data->parameter_flags); |
2950 | 2.29k | const int internal_size = compiled_data->internals->rnum; |
2951 | 2.29k | ccv_array_free(compiled_data->internals); |
2952 | 2.29k | assert(compiled_data->ids.parameters->rnum == parameter_size); |
2953 | 2.29k | assert(compiled_data->ids.internals->rnum == internal_size); |
2954 | 5.24k | for (i = 0; 2.29k i < parameter_size; i++2.94k ) |
2955 | 2.94k | ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i)); |
2956 | 2.29k | ccv_array_free(compiled_data->ids.parameters); |
2957 | 2.45k | for (i = 0; i < internal_size; i++161 ) |
2958 | 161 | ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i)); |
2959 | 2.29k | ccv_array_free(compiled_data->ids.internals); |
2960 | 2.29k | const int parallel_count = ccv_max(model->parallel_count, 1); |
2961 | 2.29k | if (compiled_data->tensors.parameters) |
2962 | 84 | { |
2963 | 781 | for (i = 0; i < parameter_size * parallel_count; i++697 ) |
2964 | | // If it is not marked as borrowed (i.e., not owned by this model), we can free it. |
2965 | 697 | if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)) |
2966 | 693 | if (compiled_data->tensors.parameters[i]) |
2967 | 693 | ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]); |
2968 | 239 | for (i = 0; i < internal_size * parallel_count; i++155 ) |
2969 | 155 | if (compiled_data->tensors.internals[i]) |
2970 | 155 | ccv_nnc_tensor_free(compiled_data->tensors.internals[i]); |
2971 | 84 | ccfree(compiled_data->tensors.parameters); |
2972 | 84 | } |
2973 | 2.29k | if (compiled_data->tensors.gradients) |
2974 | 28 | { |
2975 | 355 | for (i = 0; i < parameter_size * parallel_count; i++327 ) |
2976 | 327 | { |
2977 | 327 | if (compiled_data->tensors.gradients[i]) |
2978 | 325 | ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]); |
2979 | 327 | if (compiled_data->tensors.accum_gradients[i]) |
2980 | 15 | ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]); |
2981 | 327 | } |
2982 | 28 | ccfree(compiled_data->tensors.gradients); |
2983 | 28 | } |
2984 | 2.29k | if (compiled_data->minimize.parameters) |
2985 | 5 | { |
2986 | 15 | for (i = 0; i < compiled_data->minimize.parameters->rnum; i++10 ) |
2987 | 10 | ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i)); |
2988 | 5 | ccv_array_free(compiled_data->minimize.parameters); |
2989 | 5 | } |
2990 | 2.29k | if (compiled_data->rewindables) |
2991 | 41 | ccv_array_free(compiled_data->rewindables); |
2992 | 2.29k | if (compiled_data->tensors_init.v) |
2993 | 84 | ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v)); |
2994 | 2.29k | if (compiled_data->evaluate.tos) |
2995 | 2.29k | ccfree(compiled_data->evaluate.tos); |
2996 | 2.29k | compiled_data->evaluate.tos = 0; |
2997 | 2.29k | if (compiled_data->stream_map) |
2998 | 4 | { |
2999 | 4 | khiter_t k; |
3000 | 36 | for (k = kh_begin4 (compiled_data->stream_map); k != kh_end(compiled_data->stream_map); ++k32 ) |
3001 | 32 | { |
3002 | 32 | if (!kh_exist(compiled_data->stream_map, k)) |
3003 | 16 | continue; |
3004 | 16 | ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k); |
3005 | 16 | ccv_nnc_stream_context_free(stream); |
3006 | 16 | } |
3007 | 4 | kh_destroy(stream_map, compiled_data->stream_map); |
3008 | 4 | } |
3009 | 2.29k | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
3010 | 2.29k | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
3011 | 2.29k | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
3012 | 2.29k | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
3013 | 2.29k | if (compiled_data->gradient_checkpoints) |
3014 | 2 | { |
3015 | 4 | for (i = 0; i < compiled_data->gradient_checkpoints->rnum; i++2 ) |
3016 | 2 | { |
3017 | 2 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i); |
3018 | 2 | assert(checkpoint->inputs); |
3019 | 2 | ccfree(checkpoint->inputs); |
3020 | 2 | ccv_array_free(checkpoint->tensor_symbols); |
3021 | 2 | } |
3022 | 2 | ccv_array_free(compiled_data->gradient_checkpoints); |
3023 | 2 | } |
3024 | 2.29k | ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc); |
3025 | 2.29k | ccfree(compiled_data); |
3026 | 2.29k | } |
3027 | | |
3028 | | void ccv_cnnp_model_free(ccv_cnnp_model_t* const model) |
3029 | 5.40k | { |
3030 | 5.40k | if (model->isa->deinit) |
3031 | 1.37k | model->isa->deinit(model); |
3032 | 5.40k | if (model->io) |
3033 | 771 | { |
3034 | 771 | int i; |
3035 | 1.90k | for (i = 0; i < model->io->rnum; i++1.13k ) |
3036 | 1.13k | { |
3037 | 1.13k | ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i); |
3038 | 1.13k | if (model_io->outgoings) |
3039 | 634 | ccv_array_free(model_io->outgoings); |
3040 | 1.13k | if (model_io->incomings) |
3041 | 579 | ccv_array_free(model_io->incomings); |
3042 | 1.13k | if (model_io->dependencies) |
3043 | 2 | ccv_array_free(model_io->dependencies); |
3044 | 1.13k | ccfree(model_io); |
3045 | 1.13k | } |
3046 | 771 | ccv_array_free(model->io); |
3047 | 771 | } |
3048 | 5.40k | if (model->parameter_indices) |
3049 | 2.52k | ccv_array_free(model->parameter_indices); |
3050 | 5.40k | if (model->inputs) |
3051 | 2.29k | ccfree(model->inputs); |
3052 | 5.40k | if (model->graph) |
3053 | 2.29k | ccv_nnc_symbolic_graph_free(model->graph); |
3054 | 5.40k | if (model->compiled_data) |
3055 | 2.29k | _ccv_cnnp_compiled_data_free(model, model->compiled_data); |
3056 | 5.40k | if (model->name) |
3057 | 198 | ccfree(model->name); |
3058 | 5.40k | ccfree(model); |
3059 | 5.40k | } |
3060 | | |
3061 | | void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model) |
3062 | 0 | { |
3063 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
3064 | 0 | if (!compiled_data) |
3065 | 0 | return; |
3066 | 0 | if (compiled_data->graph) |
3067 | 0 | ccv_nnc_graph_cancel(compiled_data->graph); |
3068 | 0 | if (compiled_data->apply_gradients.graph) |
3069 | 0 | ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph); |
3070 | 0 | } |