/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_cnnp_model.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_cnnp_model.h" |
6 | | #include "_ccv_nnc_graph.h" |
7 | | |
8 | | // MARK - Level-5 API |
9 | | |
10 | | ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size) |
11 | 557 | { |
12 | 557 | if (!model->io) |
13 | 548 | model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
14 | 557 | ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size); |
15 | 557 | model_io->param_ref = 0; |
16 | 557 | model_io->param_sel = 0; |
17 | 557 | model_io->visit = 0; |
18 | 557 | model_io->model = model; |
19 | 557 | model_io->dependencies = 0; |
20 | 557 | model_io->dependents = 0; |
21 | 557 | model_io->outgoings = 0; |
22 | 557 | model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1); |
23 | 557 | ccv_array_push(model->io, &model_io); |
24 | 557 | if (input_size > 0) |
25 | 554 | { |
26 | 554 | model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), input_size, 0); |
27 | 554 | ccv_array_resize(model_io->incomings, input_size); |
28 | 554 | int i; |
29 | 554 | memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size); |
30 | 1.25k | for (i = 0; i < input_size; i++) |
31 | 700 | { |
32 | 700 | if (!inputs[i]->outgoings) |
33 | 608 | inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
34 | 700 | ccv_array_push(inputs[i]->outgoings, &model_io); |
35 | 700 | } |
36 | 554 | } else { |
37 | 3 | model_io->incomings = 0; |
38 | 3 | } |
39 | 557 | return model_io; |
40 | 557 | } |
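 | | // A minimal usage sketch (not from this file; `dense` is a hypothetical sub-model, and ccv_cnnp_input and |
 | | // MODEL_IO_LIST are the helpers typically paired with this API): |
 | | //   ccv_cnnp_model_io_t const x = ccv_cnnp_input(); |
 | | //   ccv_cnnp_model_io_t const y = ccv_cnnp_model_apply(dense, MODEL_IO_LIST(x)); |
 | | // Each apply call allocates a model_io node, records x in y's incomings and pushes y onto x's outgoings, |
 | | // which is the DAG that functional-model compilation later traverses. |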
41 | | |
42 | | void ccv_cnnp_model_add_dependencies(ccv_cnnp_model_io_t model_io, const ccv_cnnp_model_io_t* const dependencies, const int dependency_size) |
43 | 2 | { |
44 | 2 | assert(dependency_size > 0); |
45 | 2 | if (!model_io->dependencies) |
46 | 2 | model_io->dependencies = ccv_array_new(sizeof(ccv_cnnp_model_io_t), dependency_size, 0); |
47 | 2 | int i, j; |
48 | 5 | for (i = 0; i < dependency_size; i++) |
49 | 3 | { |
50 | 3 | int flag = 0; |
51 | | // Check if it already exists or not. |
52 | 4 | for (j = 0; !flag && j < model_io->dependencies->rnum; j++) |
53 | 1 | if (*(ccv_cnnp_model_io_t*)ccv_array_get(model_io->dependencies, j) == dependencies[i]) |
54 | 0 | flag = 1; |
55 | 3 | if (flag) |
56 | 0 | continue; |
57 | 3 | ccv_array_push(model_io->dependencies, dependencies + i); |
58 | 3 | ++dependencies[i]->dependents; |
59 | 3 | } |
60 | 2 | } |
61 | | |
62 | | int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model) |
63 | 0 | { |
64 | 0 | return model->output_size; |
65 | 0 | } |
66 | | |
67 | | int ccv_cnnp_model_is_trainable(const ccv_cnnp_model_t* const model) |
68 | 16 | { |
69 | | // If the model is compiled, default to trainable (1) unless is_trainable was explicitly set. |
70 | 16 | if (model->compiled_data) |
71 | 4 | return model->is_trainable >= 0 ? model->is_trainable : 1; |
72 | 12 | return model->is_trainable; |
73 | 16 | } |
74 | | |
75 | | ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index) |
76 | 393 | { |
77 | 393 | if (!model->io) |
78 | 38 | model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0); |
79 | 393 | ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s)); |
80 | 393 | model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS; |
81 | 393 | model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS; |
82 | 393 | model_io->visit = 0; |
83 | 393 | model_io->model = model; |
84 | 393 | model_io->outputs = 0; |
85 | 393 | model_io->dependencies = 0; |
86 | 393 | model_io->dependents = 0; |
87 | 393 | model_io->incomings = 0; |
88 | 393 | model_io->outgoings = 0; |
89 | 393 | ccv_array_push(model->io, &model_io); |
90 | 393 | return model_io; |
91 | 393 | } |
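 | | // A hedged sketch of how a parameters handle is typically consumed (assuming a hypothetical compiled `model`): |
 | | //   ccv_cnnp_model_io_t const all_params = ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS); |
 | | //   ccv_cnnp_model_set_minimizer(model, CMD_NOOP(), 0, &all_params, 1); // a noop minimizer freezes these parameters |
 | | // The +1 offsets above let 0 mean "unset" internally, while a negative selector or index maps to ALL_PARAMETERS |
 | | // (select everything). |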
92 | | |
93 | | void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context) |
94 | 3 | { |
95 | 3 | model->notify_hook.func = func; |
96 | 3 | model->notify_hook.context = context; |
97 | 3 | } |
98 | | |
99 | | void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload) |
100 | 14 | { |
101 | 14 | if (model->notify_hook.func) |
102 | 3 | model->notify_hook.func(model, tag, payload, model->notify_hook.context); |
103 | 14 | if (model->isa->notify) |
104 | 1 | model->isa->notify(model, tag, payload); |
105 | 14 | } |
106 | | |
107 | | static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size) |
108 | 2.24k | { |
109 | 2.24k | int i, j; |
110 | 4.85k | for (i = 0; i < graph_exec_symbol_size; i++) |
111 | 2.61k | { |
112 | 2.61k | ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i; |
113 | | // Check whether this graph exec symbol has any duplicate. |
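 | | // (Duplicates are removed below with the swap-with-last idiom: the duplicate slot is overwritten by the final |
 | | // element and the effective size shrinks by one, so the pass needs no extra allocation.) |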
114 | 23.2k | for (j = i + 1; j < graph_exec_symbol_size;) |
115 | 20.6k | { |
116 | 20.6k | ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j; |
117 | | // If there is an identical graph exec symbol, remove it. |
118 | 20.6k | if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph) |
119 | 2.71k | { |
120 | 2.71k | if (j + 1 < graph_exec_symbol_size) |
121 | 439 | *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1]; |
122 | 2.71k | --graph_exec_symbol_size; |
123 | 2.71k | continue; |
124 | 2.71k | } |
125 | 17.9k | ++j; |
126 | 17.9k | } |
127 | 2.61k | } |
128 | 2.24k | return graph_exec_symbol_size; |
129 | 2.24k | } |
130 | | |
131 | | void ccv_cnnp_model_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol, const int is_trainable) |
132 | 3.16k | { |
133 | 3.16k | ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context; |
134 | 3.16k | ccv_cnnp_model_t* const model = add_to_array_context->sequence->model; |
135 | 3.16k | int i; |
136 | 3.16k | if (add_to_array_context->add_parameter_indices && !model->parameter_indices) |
137 | 2.52k | model->parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
138 | 37.1k | for (i = 0; i < add_to_array_context->symbols->rnum; i++) |
139 | 33.9k | { |
140 | 33.9k | const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i); |
141 | 33.9k | if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph) |
142 | 28 | { |
143 | | // Only add to parameter_indices if it is trainable. |
144 | 28 | if (add_to_array_context->add_parameter_indices) |
145 | 15 | ccv_array_add_unique_int(model->parameter_indices, i); |
146 | | // Found it, return, don't add it. |
147 | 28 | return; |
148 | 28 | } |
149 | 33.9k | } |
150 | | // Only add to parameter_indices if it is trainable. |
151 | 3.13k | if (add_to_array_context->add_parameter_indices) |
152 | 2.95k | ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum); |
153 | | // This is a new one, no need to add_unique_int, it is unique. |
154 | 3.13k | ccv_array_push(add_to_array_context->symbols, &symbol); |
155 | 3.13k | if (add_to_array_context->trainables) |
156 | 2.96k | ccv_array_push(add_to_array_context->trainables, &is_trainable); |
157 | 3.13k | char id[2048]; |
158 | 3.13k | id[0] = add_to_array_context->prefix; |
159 | 3.13k | id[1] = '-'; |
160 | 3.13k | int total_len = 2; |
161 | 6.50k | for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++) |
162 | 3.36k | { |
163 | 3.36k | const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i); |
164 | 3.36k | int len; |
165 | 3.36k | if (name->name && name->name[0] != '\0') |
166 | 364 | len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence); |
167 | 3.00k | else |
168 | 3.00k | len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence); |
169 | 3.36k | total_len += len; |
170 | 3.36k | if (total_len >= 2047) |
171 | 0 | break; |
172 | 3.36k | } |
173 | 3.13k | if (total_len < 2047) |
174 | 3.13k | total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it); |
175 | 3.13k | assert(total_len < 2048); |
176 | 3.13k | char *heap_id = (char*)ccmalloc(total_len + 1); |
177 | 3.13k | memcpy(heap_id, id, total_len + 1); |
178 | 3.13k | ccv_array_push(add_to_array_context->ids, &heap_id); |
179 | 3.13k | ++add_to_array_context->sequence->it; |
180 | 3.13k | } |
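 | | // A hypothetical example of the id built above: with prefix 't', a sequence of {name: "", sequence: 0} and |
 | | // {name: "dense", sequence: 1}, and it == 0, the parameter id becomes "t-0-dense-1-0". |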
181 | | |
182 | | static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size, ccv_array_t* const gradient_checkpoints) |
183 | 2.30k | { |
184 | 2.30k | compiled_data->f = compiled_data->fits + output_size; |
185 | 2.30k | compiled_data->xpu_alloc.mp_hdr = -1; |
186 | 2.30k | compiled_data->xpu_alloc.freed = kh_init(dy_str); |
187 | 2.30k | compiled_data->xpu_alloc.allocd = kh_init(dy_alloc); |
188 | 2.30k | compiled_data->gradient_checkpoints = gradient_checkpoints; |
189 | 2.30k | } |
190 | | |
191 | | typedef struct { |
192 | | void* old_graph_exec_symbol_new_hook_context; |
193 | | ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook; |
194 | | ccv_nnc_symbolic_graph_t* graph; |
195 | | ccv_cnnp_model_build_data_t* build_data; |
196 | | } ccv_cnnp_model_set_exec_flags_context_t; |
197 | | |
198 | | static void _ccv_cnnp_model_set_exec_flags(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) |
199 | 2.92k | { |
200 | 2.92k | ccv_cnnp_model_set_exec_flags_context_t* flags_context = (ccv_cnnp_model_set_exec_flags_context_t*)context; |
201 | 2.92k | if (flags_context->build_data->exec_flags) |
202 | 0 | ccv_nnc_graph_exec_symbol_set_flags(flags_context->graph, symbol, flags_context->build_data->exec_flags); |
203 | 2.92k | if (flags_context->old_graph_exec_symbol_new_hook) |
204 | 2.20k | flags_context->old_graph_exec_symbol_new_hook(flags_context->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name); |
205 | 2.92k | } |
206 | | |
207 | | static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss) |
208 | 2.30k | { |
209 | 2.30k | assert(model->graph); |
210 | 2.30k | model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size); |
211 | 2.30k | int i; |
212 | 4.67k | for (i = 0; i < input_size; i++) |
213 | 2.37k | model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0); |
214 | 2.30k | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
215 | 2.30k | ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0); |
216 | 2.30k | ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0); |
217 | 2.30k | ccv_cnnp_model_sequence_t model_sequence = { |
218 | 2.30k | .bank = kh_init(ccv_cnnp_model_name_bank) |
219 | 2.30k | }; |
220 | 2.30k | ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = { |
221 | 2.30k | .add_parameter_indices = 1, |
222 | 2.30k | .prefix = 't', |
223 | 2.30k | .sequence = &model_sequence, |
224 | 2.30k | .symbols = parameters, |
225 | 2.30k | .ids = parameter_ids, |
226 | 2.30k | .trainables = parameter_trainables, |
227 | 2.30k | }; |
228 | 2.30k | ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
229 | 2.30k | ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0); |
230 | 2.30k | ccv_cnnp_model_add_to_array_context_t add_to_output_context = { |
231 | 2.30k | .add_parameter_indices = 0, |
232 | 2.30k | .prefix = 'r', |
233 | 2.30k | .sequence = &model_sequence, |
234 | 2.30k | .symbols = internals, |
235 | 2.30k | .ids = internal_ids, |
236 | 2.30k | .trainables = 0, |
237 | 2.30k | }; |
238 | 2.30k | ccv_cnnp_model_build_data_t build_data = { |
239 | 2.30k | .exec_flags = 0, |
240 | 2.30k | .is_trainable = model->is_trainable >= 0 ? model->is_trainable : 1, |
241 | 2.30k | .model_sequence = &model_sequence, |
242 | 2.30k | .add_to_array = ccv_cnnp_model_add_to_array, |
243 | 2.30k | .parameters = parameters, |
244 | 2.30k | .context = { |
245 | 2.30k | .add_to_parameter = &add_to_parameter_context, |
246 | 2.30k | .add_to_output = &add_to_output_context, |
247 | 2.30k | }, |
248 | 2.30k | .gradient_checkpoints = 0, |
249 | 2.30k | }; |
250 | 2.30k | model->data = &build_data; |
251 | 2.30k | ccv_cnnp_model_set_exec_flags_context_t flags_context = { |
252 | 2.30k | .graph = model->graph, |
253 | 2.30k | .build_data = &build_data, |
254 | 2.30k | .old_graph_exec_symbol_new_hook = 0, |
255 | 2.30k | .old_graph_exec_symbol_new_hook_context = 0 |
256 | 2.30k | }; |
257 | 2.30k | flags_context.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_set_exec_flags, &flags_context, &flags_context.old_graph_exec_symbol_new_hook); |
258 | 2.30k | ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0); |
259 | | // Reset back to previous hook. |
260 | 2.30k | ccv_nnc_graph_exec_symbol_new_hook(model->graph, flags_context.old_graph_exec_symbol_new_hook, flags_context.old_graph_exec_symbol_new_hook_context, 0); |
261 | 4.62k | for (i = 0; i < model->output_size; i++) |
262 | 2.31k | { |
263 | 2.31k | const ccv_nnc_tensor_symbol_t output = model->outputs[i]; |
264 | 2.31k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, output); |
265 | 2.31k | if (alias_to.d == CCV_NNC_NO_TENSOR_SYMBOL) |
266 | 1.31k | continue; |
267 | | // If the output is an alias, insert a data transform regardless, for result correctness (we cannot bind an alias). You can check the ccv_nnc_tensor_bind_symbol method |
268 | | // to see that we can correctly bind a tensor that has aliases derived from it, but we cannot correctly bind an alias tensor itself (this is expected, |
269 | | // because we cannot handle the case where the alias covers part of the original tensor but is bound differently). |
270 | 1.00k | const ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(model->graph, output); |
271 | 1.00k | model->outputs[i] = ccv_nnc_tensor_symbol_new(model->graph, output_params, 0); |
272 | 1.00k | ccv_nnc_graph_exec_symbol_t make_contiguous = ccv_nnc_graph_exec_symbol_new(model->graph, CMD_FORMAT_TRANSFORM_FORWARD(), &output, 1, model->outputs + i, 1, "contiguous"); |
273 | 1.00k | ccv_nnc_graph_exec_symbol_set_flags(model->graph, make_contiguous, CCV_NNC_GRAPH_EXEC_DISABLE_OPT); |
274 | 1.00k | } |
275 | 2.30k | model->data = 0; |
276 | 2.30k | kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank); |
277 | 2.30k | if (model_sequence.sequences) |
278 | 2.28k | ccv_array_free(model_sequence.sequences); |
279 | | // Check if there are parameters that are not trainable. If there are, we will allocate a uint64 bitmap to record that. |
280 | 2.30k | int not_trainables = 0; |
281 | | // Assert no parameter is alias. |
282 | 5.25k | for (i = 0; i < parameters->rnum; i++) |
283 | 2.95k | { |
284 | 2.95k | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i); |
285 | 2.95k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter); |
286 | 2.95k | assert(alias_to.graph == 0); // Cannot find the one alias to. |
287 | 2.95k | if (*(int*)ccv_array_get(parameter_trainables, i) == 0) |
288 | 14 | not_trainables = 1; |
289 | 2.95k | } |
290 | 2.30k | assert(parameters->rnum == parameter_trainables->rnum); |
291 | 2.30k | uint64_t* parameter_flags = 0; |
292 | 2.30k | if (not_trainables) |
293 | 10 | { |
294 | 10 | parameter_flags = (uint64_t*)cccalloc(((parameters->rnum + 63) >> 6), sizeof(uint64_t)); |
295 | 44 | for (i = 0; i < parameter_trainables->rnum; i++) |
296 | 34 | if (*(int*)ccv_array_get(parameter_trainables, i)) |
297 | 20 | parameter_flags[i >> 6] |= ((uint64_t)1 << (i & 63)); |
298 | 10 | } |
299 | 2.30k | ccv_array_free(parameter_trainables); |
300 | | // Assert no internal is alias. |
301 | 2.46k | for (i = 0; i < internals->rnum; i++) |
302 | 165 | { |
303 | 165 | const ccv_nnc_tensor_symbol_t internal = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i); |
304 | 165 | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(internal.graph, internal); |
305 | 165 | assert(alias_to.graph == 0); // Cannot find the one alias to. |
306 | 165 | } |
307 | 2.30k | const int output_size = model->output_size; |
308 | 2.30k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
309 | 2.30k | const int parameters_rnum = parameters->rnum; |
310 | 2.30k | if (input_size > 0) |
311 | 2.30k | { |
312 | 2.30k | ccv_array_resize(parameters, parameters_rnum + input_size); |
313 | 2.30k | memcpy(ccv_array_get(parameters, parameters_rnum), model->inputs, input_size * sizeof(ccv_nnc_tensor_symbol_t)); |
314 | 2.30k | } |
315 | 2.30k | ccv_nnc_symbolic_graph_simplify(model->graph, |
316 | 2.30k | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION, |
317 | 2.30k | CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, |
318 | 2.30k | CCV_NNC_SIMPLIFY_OPS_FUSION, |
319 | 2.30k | CCV_NNC_SIMPLIFY_GRAPH_PRUNING), |
320 | 2.30k | ccv_array_get(parameters, 0), parameters_rnum + input_size, |
321 | 2.30k | model->outputs, output_size, |
322 | 2.30k | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
323 | 2.30k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
324 | | // Size it down. |
325 | 2.30k | parameters->rnum = parameters_rnum; |
326 | 2.30k | ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1)); |
327 | 2.30k | _ccv_cnnp_compiled_data_init(compiled_data, output_size, build_data.gradient_checkpoints); |
328 | 2.30k | const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph); |
329 | 2.30k | assert(evaluate_to_size > 0); |
330 | 2.30k | compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size); |
331 | 2.30k | memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size); |
332 | 2.30k | compiled_data->loss = loss; |
333 | 2.30k | if (loss.cmd == CCV_NNC_NOOP) |
334 | 2.29k | { |
335 | | // If no loss function provided, there is no fits. |
336 | 4.60k | for (i = 0; i < output_size; i++) |
337 | 2.30k | { |
338 | 2.30k | compiled_data->fits[i] = NO_TENSOR_SYMBOL; |
339 | 2.30k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]); |
340 | 2.30k | if (alias_to.d < 0) |
341 | 2.30k | compiled_data->f[i] = model->outputs[i]; |
342 | 0 | else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original. |
343 | 0 | int ofs[CCV_NNC_MAX_DIM_ALLOC]; |
344 | 0 | int inc[CCV_NNC_MAX_DIM_ALLOC]; |
345 | 0 | ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc); |
346 | 0 | int j; |
347 | 0 | for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++) |
348 | 0 | { assert(ofs[j] == 0); } // There is no ofs. |
349 | 0 | compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet. |
350 | 0 | } |
351 | 2.30k | } |
352 | 2.29k | } else { |
353 | 20 | for (i = 0; i < output_size; i++) |
354 | 10 | { |
355 | 10 | const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]); |
356 | 10 | const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0); |
357 | 10 | compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0); |
358 | 10 | ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0); |
359 | 10 | } |
360 | 10 | } |
361 | 2.30k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
362 | 2.30k | ccv_nnc_symbolic_graph_simplify(model->graph, |
363 | 2.30k | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function. |
364 | 2.30k | 0, 0, // No need to provide binds at this point. |
365 | 2.30k | compiled_data->f, model->output_size, |
366 | 2.30k | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
367 | 2.30k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
368 | | // If inputs are from GPU, stream type is GPU. |
369 | 2.30k | compiled_data->parameters = parameters; |
370 | 2.30k | compiled_data->parameter_flags = parameter_flags; |
371 | 2.30k | compiled_data->internals = internals; |
372 | 2.30k | compiled_data->ids.parameters = parameter_ids; |
373 | 2.30k | compiled_data->ids.internals = internal_ids; |
374 | 2.30k | ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(compiled_data, model->graph); |
375 | 2.30k | } |
376 | | |
377 | | static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) |
378 | 8.82k | { |
379 | 8.82k | ccv_array_t* const stack = (ccv_array_t*)context; |
380 | 8.82k | ccv_array_push(stack, &symbol.d); |
381 | 8.82k | } |
382 | | |
383 | | static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index) |
384 | 38.5k | { |
385 | 38.5k | const ccv_nnc_tensor_symbol_t src_symbol = { |
386 | 38.5k | .d = src_index, |
387 | 38.5k | .graph = src_graph |
388 | 38.5k | }; |
389 | 38.5k | const ccv_nnc_tensor_symbol_t dest_symbol = { |
390 | 38.5k | .d = dest_index, |
391 | 38.5k | .graph = dest_graph |
392 | 38.5k | }; |
393 | 38.5k | const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol); |
394 | 38.5k | ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params); |
395 | 38.5k | int ofs[CCV_NNC_MAX_DIM_ALLOC]; |
396 | 38.5k | int inc[CCV_NNC_MAX_DIM_ALLOC]; |
397 | 38.5k | if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc)) |
398 | 2.00k | ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc); |
399 | 38.5k | } |
400 | | |
401 | | static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index) |
402 | 2.41k | { |
403 | 2.41k | const ccv_nnc_tensor_symbol_t src_symbol = { |
404 | 2.41k | .d = src_index, |
405 | 2.41k | .graph = src_graph |
406 | 2.41k | }; |
407 | 2.41k | const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol); |
408 | 2.41k | const ccv_nnc_tensor_symbol_t dest_symbol = { |
409 | 2.41k | .d = dest_index, |
410 | 2.41k | .graph = dest_graph |
411 | 2.41k | }; |
412 | 2.41k | const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol); |
413 | 2.41k | return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0; |
414 | 2.41k | } |
415 | | |
416 | | static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size); |
417 | | static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data); |
418 | | |
419 | | typedef struct { |
420 | | int parallel_count; |
421 | | ccv_nnc_symbolic_graph_t* graph; |
422 | | ccv_nnc_graph_exec_arena_t* graph_exec_arena; |
423 | | } ccv_nnc_graph_exec_update_t; |
424 | | |
425 | | static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint) |
426 | 58 | { |
427 | 58 | ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context; |
428 | 58 | ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena; |
429 | 58 | ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol); |
430 | 58 | ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd); |
431 | 58 | ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint); |
432 | 58 | const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph; |
433 | 58 | const int parallel_count = graph_exec_update->parallel_count; |
434 | 58 | int i; |
435 | 178 | for (i = 1; i < parallel_count; i++) |
436 | 120 | { |
437 | 120 | const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i)); |
438 | 120 | if (!CCV_NO_GRAPH_EXEC(copy)) |
439 | 120 | { |
440 | 120 | ccv_nnc_graph_exec_set(copy.graph, copy, cmd); |
441 | 120 | ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint); |
442 | 120 | } |
443 | 120 | } |
444 | 58 | } |
445 | | |
446 | | void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size) |
447 | 2.20k | { |
448 | 2.20k | assert(model->graph); |
449 | 2.20k | assert(model->compiled_data); |
450 | 2.20k | assert(!init->graph); |
451 | 2.20k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
452 | 2.20k | init->graph = ccv_nnc_symbolic_graph_new(); |
453 | 2.20k | ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0); |
454 | 2.20k | ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack, 0); |
455 | 2.20k | _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss); |
456 | 2.20k | init->parallel_count = model->parallel_count; |
457 | 2.20k | init->memory_compression = model->memory_compression; |
458 | 2.20k | init->memory_reduction = model->memory_reduction; |
459 | 2.20k | init->gradient_checkpointing = model->gradient_checkpointing; |
460 | 2.20k | init->compiled_data->stream_type = model->compiled_data->stream_type; |
461 | 2.20k | init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer; |
462 | 2.20k | init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size; |
463 | 2.20k | if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
464 | 2.20k | _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0); |
465 | 2.20k | ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0, 0); |
466 | 2.20k | ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL); |
467 | 2.20k | int i, j; |
468 | | // Verify parameters, internals and saved_aux in both graph has the same dimensionality. |
469 | 4.61k | for (i = 0; i < compiled_data->parameters->rnum; i++) |
470 | 2.41k | { |
471 | 2.41k | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
472 | 2.41k | assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d)); |
473 | 2.41k | } |
474 | 2.20k | for (i = 0; i < compiled_data->internals->rnum; i++) |
475 | 0 | { |
476 | 0 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d; |
477 | 0 | assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d)); |
478 | 0 | } |
479 | | // Update inputs. |
480 | 2.20k | assert(model->input_size == init->input_size); |
481 | 4.40k | for (i = 0; i < model->input_size; i++) |
482 | 2.20k | if (model->inputs[i].d >= 0) |
483 | 2.20k | { |
484 | 2.20k | assert(init->inputs[i].d >= 0); |
485 | 2.20k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d); |
486 | 2.20k | } |
487 | | // Update outputs. |
488 | 2.20k | assert(model->output_size == init->output_size); |
489 | 4.40k | for (i = 0; i < model->output_size; i++) |
490 | 2.20k | { |
491 | 2.20k | if (model->outputs[i].d >= 0) |
492 | 2.20k | { |
493 | 2.20k | assert(init->outputs[i].d >= 0); |
494 | 2.20k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d); |
495 | 2.20k | } |
496 | 2.20k | if (model->outputs[i].d != model->compiled_data->f[i].d) |
497 | 0 | { |
498 | 0 | assert(init->outputs[i].d != init->compiled_data->f[i].d); |
499 | 0 | if (model->compiled_data->f[i].d >= 0) |
500 | 0 | { |
501 | 0 | assert(init->compiled_data->f[i].d >= 0); |
502 | 0 | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d); |
503 | 0 | } |
504 | 0 | } |
505 | 2.20k | } |
506 | | // Go through the graph to set tensor on matching symbols |
507 | 11.0k | for (i = 0; i < stack->rnum; i++) |
508 | 8.82k | { |
509 | 8.82k | const int d = *(int*)ccv_array_get(stack, i); |
510 | | // If exceed range, skip. |
511 | 8.82k | if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) || |
512 | 8.82k | d >= ccv_nnc_graph_exec_symbol_count(model->graph)) |
513 | 0 | continue; |
514 | 8.82k | const ccv_nnc_graph_exec_symbol_t src_symbol = { |
515 | 8.82k | .d = d, |
516 | 8.82k | .graph = init->graph |
517 | 8.82k | }; |
518 | 8.82k | const ccv_nnc_graph_exec_symbol_t dest_symbol = { |
519 | 8.82k | .d = d, |
520 | 8.82k | .graph = model->graph |
521 | 8.82k | }; |
522 | 8.82k | const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol); |
523 | 8.82k | const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol); |
524 | | // If the command doesn't match, skip. |
525 | 8.82k | if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP) |
526 | 0 | continue; |
527 | | // Now get all the inputs and outputs, if matches, set them. |
528 | 8.82k | const int* src_inputs; |
529 | 8.82k | int src_input_size; |
530 | 8.82k | const int* src_outputs; |
531 | 8.82k | int src_output_size; |
532 | 8.82k | ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size); |
533 | 8.82k | const int* dest_inputs; |
534 | 8.82k | int dest_input_size; |
535 | 8.82k | const int* dest_outputs; |
536 | 8.82k | int dest_output_size; |
537 | 8.82k | ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size); |
538 | | // We may have unmatched input / output size because this is the minimizer and it has |
539 | | // different saved_aux (for example, when we shrunk with CMD_NOOP). |
540 | 8.82k | if (src_input_size != dest_input_size) |
541 | 0 | continue; |
542 | 8.82k | if (src_output_size != dest_output_size) |
543 | 0 | continue; |
544 | 8.82k | ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd); |
545 | | // There may be mismatches between the source tensor symbols and destination tensor symbols. The reason is that |
546 | | // we may pass in the minimizer later; therefore, we may allocate tensors for the minimizer later in the original |
547 | | // graph, whereas in the newly created graph it is streamlined (the minimizer exists from the beginning). That |
548 | | // makes the order of tensor symbol creation different, and therefore which tensor is which can end up wrong as |
549 | | // well. However, setting a new minimizer won't change the exec symbol ordering, because we never create new exec |
550 | | // symbols after the gradient init step. Setting a new minimizer just updates that exec symbol's settings; it is not |
551 | | // a new exec symbol. |
552 | 33.7k | for (j = 0; j < src_input_size; j++) |
553 | 24.8k | if (src_inputs[j] >= 0) |
554 | 20.4k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]); |
555 | 22.4k | for (j = 0; j < src_output_size; j++) |
556 | 13.6k | if (src_outputs[j] >= 0) |
557 | 13.6k | _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]); |
558 | 8.82k | } |
559 | 2.20k | ccv_array_free(stack); |
560 | | // After this, we get all tensors in the model graph resolved through tensor_auto. |
561 | 2.20k | ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL); |
562 | | // Verify symbols we get matches. |
563 | 2.20k | const int parameter_size = compiled_data->parameters->rnum; |
564 | 4.61k | for (i = 0; i < parameter_size; i++) |
565 | 2.41k | { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); } |
566 | 2.20k | const int internal_size = compiled_data->internals->rnum; |
567 | 2.20k | for (i = 0; i < internal_size; i++) |
568 | 0 | { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); } |
569 | | // Go through compiled data. |
570 | 2.20k | if (compiled_data->tensor_arena) |
571 | 2.20k | { |
572 | 2.20k | const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph); |
573 | 2.20k | if (flag == 0 && compiled_data->graph_exec_arena) |
574 | 2.20k | { |
575 | 2.20k | ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph); |
576 | | // Since we will reinit, if we previously set is_test, we need to set it again. |
577 | 2.20k | if (compiled_data->is_test) |
578 | 1 | { |
579 | 1 | const int parallel_count = ccv_max(model->parallel_count, 1); |
580 | 1 | ccv_nnc_graph_exec_update_t update = { |
581 | 1 | .parallel_count = parallel_count, |
582 | 1 | .graph = model->graph, |
583 | 1 | .graph_exec_arena = compiled_data->graph_exec_arena, |
584 | 1 | }; |
585 | 1 | ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update); |
586 | 1 | } |
587 | 2.20k | } else |
588 | | // Free-up tensor arena & graph exec arena. |
589 | 0 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
590 | 2.20k | } |
591 | | // There are other compiled graphs, for accum and apply gradients. |
592 | | // However, the main conclusion is, these absorb operations shouldn't impact parameters. |
593 | | // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we |
594 | | // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot |
595 | | // be changed otherwise parameters' shape will be meaningless. The same goes to internals. |
596 | | // That is why we don't update these compiled graphs at all this point. |
597 | | // Free the model, we've already "absorbed" it. |
598 | 2.20k | ccv_cnnp_model_free(init); |
599 | 2.20k | } |
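 | | // In short: absorb builds a throwaway `init` model compiled against the new input shapes, copies its symbol |
 | | // parameters (shapes and aliases) onto the existing graph, reinits the tensor arena in place, and then frees |
 | | // `init`. This is what lets ccv_cnnp_model_compile below accept new input dimensions without rebuilding everything. |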
600 | | |
601 | | void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss) |
602 | 2.29k | { |
603 | 2.29k | assert(input_size == model->input_size || model->input_size == 0); |
604 | 2.29k | if (model->input_size == 0) |
605 | 10 | model->input_size = input_size; |
606 | 2.29k | if (!model->graph) // The graph is not compiled yet. |
607 | 97 | { |
608 | 97 | model->graph = ccv_nnc_symbolic_graph_new(); |
609 | 97 | _ccv_cnnp_model_compile(model, inputs, input_size, loss); |
610 | 97 | assert(model->compiled_data); |
611 | 97 | int i, flag = 0; |
612 | 246 | for (i = 0; !flag && i < input_size; i++) |
613 | 149 | flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY); |
614 | | // If inputs are from GPU, stream type is GPU. |
615 | 97 | model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU; |
616 | 97 | model->compiled_data->minimize.minimizer = minimizer; |
617 | 97 | model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer); |
618 | 2.20k | } else { |
619 | | // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model, |
620 | | // and then absorb the "new model" into the old one. |
621 | 2.20k | ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model, model->is_trainable); |
622 | 2.20k | ccv_cnnp_model_absorb(model, init, inputs, input_size); |
623 | | // Reset minimizer. |
624 | 2.20k | ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0); |
625 | 2.20k | } |
626 | 2.29k | } |
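 | | // A minimal usage sketch (not from this file; CPU_TENSOR_NHWC and CMD_NOOP are the usual helper macros, and the |
 | | // hypothetical `model` takes one 32F input of size 1): |
 | | //   const ccv_nnc_tensor_param_t a = CPU_TENSOR_NHWC(32F, 1); |
 | | //   ccv_cnnp_model_compile(model, &a, 1, CMD_NOOP(), CMD_NOOP()); // noop minimizer and loss: inference-style compile |
 | | // Calling it again with different input params goes through the copy + absorb path above instead of a fresh build. |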
627 | | |
628 | | ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model, const int is_trainable) |
629 | 2.20k | { |
630 | 2.20k | ccv_cnnp_model_t* const new_model = _ccv_cnnp_model_copy(model, 0); |
631 | 2.20k | new_model->is_trainable = is_trainable; |
632 | 2.20k | return new_model; |
633 | 2.20k | } |
634 | | |
635 | | void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size) |
636 | 4.44k | { |
637 | 4.44k | assert(model->graph); |
638 | 4.44k | assert(output_size == model->output_size); |
639 | 4.44k | ccv_nnc_symbolic_graph_t* const graph = model->graph; |
640 | 4.44k | ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL); |
641 | 4.44k | int i; |
642 | 8.89k | for (i = 0; i < output_size; i++) |
643 | 4.45k | { |
644 | 4.45k | assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL); |
645 | 4.45k | outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]); |
646 | 4.45k | } |
647 | 4.44k | } |
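 | | // A hedged sketch: after compile, output shapes can be queried without running anything, e.g. for a hypothetical |
 | | // single-output model: |
 | | //   ccv_nnc_tensor_param_t output_params; |
 | | //   ccv_cnnp_model_tensor_auto(model, &output_params, 1); // output_params.dim now holds the inferred shape |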
648 | | |
649 | | void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size) |
650 | 3 | { |
651 | 3 | if (workspace_size == model->workspace_size) |
652 | 0 | return; |
653 | 3 | model->workspace_size = workspace_size; |
654 | 3 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
655 | 3 | if (compiled_data && compiled_data->graph) |
656 | 0 | ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL); |
657 | 3 | } |
658 | | |
659 | | size_t ccv_cnnp_model_workspace_size(ccv_cnnp_model_t* const model) |
660 | 0 | { |
661 | 0 | return model->workspace_size; |
662 | 0 | } |
663 | | |
664 | | void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel) |
665 | 15 | { |
666 | 15 | if (parallel == 0) |
667 | 0 | model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU); |
668 | 15 | else |
669 | 15 | model->parallel_count = parallel; |
670 | 15 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
671 | 15 | if (compiled_data) |
672 | 11 | { assert(!compiled_data->graph); } |
673 | 15 | } |
674 | | |
675 | | void ccv_cnnp_model_set_max_concurrency(ccv_cnnp_model_t* const model, const int max_stream_count) |
676 | 0 | { |
677 | 0 | model->max_stream_count = max_stream_count; |
678 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
679 | 0 | if (compiled_data) |
680 | 0 | { assert(!compiled_data->graph); } |
681 | 0 | } |
682 | | |
683 | | void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression) |
684 | 0 | { |
685 | 0 | model->memory_compression = memory_compression; |
686 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
687 | 0 | if (compiled_data) |
688 | 0 | { assert(!compiled_data->graph); } |
689 | 0 | } |
690 | | |
691 | | void ccv_cnnp_model_set_memory_reduction(ccv_cnnp_model_t* const model, const int memory_reduction) |
692 | 0 | { |
693 | 0 | model->memory_reduction = memory_reduction; |
694 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
695 | 0 | if (compiled_data) |
696 | 0 | { assert(!compiled_data->graph); } |
697 | 0 | } |
698 | | |
699 | | void ccv_cnnp_model_set_gradient_checkpointing(ccv_cnnp_model_t* const model, const int gradient_checkpointing) |
700 | 2 | { |
701 | 2 | model->gradient_checkpointing = gradient_checkpointing; |
702 | 2 | } |
703 | | |
704 | | int ccv_cnnp_model_gradient_checkpointing(ccv_cnnp_model_t* const model) |
705 | 0 | { |
706 | 0 | return model->gradient_checkpointing; |
707 | 0 | } |
708 | | |
709 | | typedef struct { |
710 | | int parallel_count; |
711 | | ccv_nnc_symbolic_graph_t* graph; |
712 | | ccv_cnnp_compiled_data_t* compiled_data; |
713 | | ccv_nnc_tensor_arena_t* tensor_arena; |
714 | | } ccv_nnc_tensor_init_states_t; |
715 | | |
716 | | static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data) |
717 | 99 | { |
718 | 99 | int i; |
719 | 99 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
720 | 179 | for (i = 0; i < compiled_data->parameters->rnum; i++) |
721 | 119 | { |
722 | 119 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
723 | 119 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
724 | 39 | return 1; |
725 | 119 | } |
726 | 60 | for (i = 0; i < compiled_data->internals->rnum; i++) |
727 | 6 | { |
728 | 6 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d; |
729 | 6 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
730 | 6 | return 1; |
731 | 6 | } |
732 | 54 | return 0; |
733 | 60 | } |
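 | | // The bitmap math above: tensors_init.v packs one bit per tensor symbol, so 32-bit word d >> 5 holds symbol d and |
 | | // (1u << (d & 0x1f)) masks its bit; a cleared bit means that parameter or internal still needs initialization. |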
734 | | |
735 | | static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol) |
736 | 341 | { |
737 | 341 | ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context; |
738 | 341 | ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena; |
739 | 341 | ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol); |
740 | 341 | if (!output_tensor) |
741 | 0 | return; |
742 | 341 | const int d = output_symbol.d; |
743 | 341 | assert(d < tensor_init_states->compiled_data->tensors_init.size); |
744 | 341 | uint32_t* const init_v = CCV_NNC_INIT_V(tensor_init_states->compiled_data->tensors_init.v); |
745 | 341 | if (init_v[d >> 5] & (1u << (d & 0x1f))) |
746 | 34 | return; |
747 | 307 | init_v[d >> 5] |= (1u << (d & 0x1f)); |
748 | 307 | ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0); |
749 | 307 | const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph; |
750 | 307 | const int parallel_count = tensor_init_states->parallel_count; |
751 | 307 | int i; |
752 | 787 | for (i = 1; i < parallel_count; i++) |
753 | 480 | { |
754 | 480 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i)); |
755 | 480 | if (copy) |
756 | 480 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, ©, 1, 0); |
757 | 480 | } |
758 | 307 | } |
759 | | |
760 | | // This method can only handle cases where we added new tensors and execs, never deleted any. This invariant holds because |
761 | | // we set up everything (including calling the simplify method) in ccv_cnnp_model_compile, before this rewind setup. |
762 | | static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model) |
763 | 2 | { |
764 | 2 | assert(model->graph); |
765 | 2 | assert(model->compiled_data); |
766 | 2 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
767 | 2 | assert(compiled_data->rewindables); |
768 | 2 | int i; |
769 | 51 | for (i = 0; i < compiled_data->rewindables->rnum; i++) |
770 | 49 | { |
771 | 49 | const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i); |
772 | 49 | if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC) |
773 | 16 | ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec); |
774 | 33 | else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR) |
775 | 33 | ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor); |
776 | 49 | } |
777 | 2 | ccv_array_clear(compiled_data->rewindables); |
778 | 2 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
779 | 2 | } |
780 | | |
781 | | static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name) |
782 | 6.13k | { |
783 | 6.13k | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
784 | 6.13k | .type = CCV_CNNP_REWIND_TENSOR, |
785 | 6.13k | .tensor = symbol |
786 | 6.13k | }; |
787 | 6.13k | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
788 | 6.13k | ccv_array_push(rewind_symbols, &rewind_symbol); |
789 | 6.13k | } |
790 | | |
791 | | static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name) |
792 | 475 | { |
793 | 475 | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
794 | 475 | .type = CCV_CNNP_REWIND_TENSOR, |
795 | 475 | .tensor = symbol |
796 | 475 | }; |
797 | 475 | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
798 | 475 | ccv_array_push(rewind_symbols, &rewind_symbol); |
799 | 475 | } |
800 | | |
801 | | static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) |
802 | 2.34k | { |
803 | 2.34k | const ccv_cnnp_rewind_symbol_t rewind_symbol = { |
804 | 2.34k | .type = CCV_CNNP_REWIND_GRAPH_EXEC, |
805 | 2.34k | .graph_exec = symbol |
806 | 2.34k | }; |
807 | 2.34k | ccv_array_t* const rewind_symbols = (ccv_array_t*)context; |
808 | 2.34k | ccv_array_push(rewind_symbols, &rewind_symbol); |
809 | 2.34k | } |
810 | | |
811 | | static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph) |
812 | 35.0k | { |
813 | 35.0k | ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol); |
814 | 35.0k | if (!CCV_NO_GRAPH_EXEC(update_exec)) |
815 | 19.9k | ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd); |
816 | 35.0k | int i; |
817 | 49.9k | for (i = 1; i < parallel_count; i++) |
818 | 14.8k | { |
819 | 14.8k | ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i); |
820 | 14.8k | const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol); |
821 | 14.8k | if (!CCV_NO_GRAPH_EXEC(copy)) |
822 | 14.6k | ccv_nnc_graph_exec_set(copy.graph, copy, cmd); |
823 | 14.8k | } |
824 | 35.0k | } |
825 | | |
826 | | static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd) |
827 | 20.0k | { |
828 | 20.0k | assert(compiled_data); |
829 | 20.0k | assert(symbolic_graph); |
830 | 20.0k | ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd); |
831 | 20.0k | int i; |
832 | 35.0k | for (i = 1; i < parallel_count; i++) |
833 | 14.9k | { |
834 | 14.9k | ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i); |
835 | 14.9k | if (copy_symbol.graph) |
836 | 14.8k | ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd); |
837 | 14.9k | } |
838 | 20.0k | ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena; |
839 | 20.0k | if (graph_exec_arena) |
840 | 20.0k | _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph); |
841 | | // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph) |
842 | 20.0k | ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena; |
843 | 20.0k | if (gradient_graph_exec_arena) |
844 | 15.0k | _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph); |
845 | 20.0k | } |
846 | | |
847 | | static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice) |
848 | 20.0k | { |
849 | 20.0k | int this_parameter_flag = 0; |
850 | 20.0k | if (update_nodes[parameter_indice].d == CCV_NNC_NO_TENSOR_SYMBOL) |
851 | 0 | return this_parameter_flag; |
852 | 20.0k | const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]); |
853 | 20.0k | int j, k; |
854 | | // For no-op, we can preserve previous saved_aux_size. |
855 | 20.0k | if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP) |
856 | 67 | { |
857 | | // If the old minimizer is a noop, then old_saved_aux_size should be whatever its previous |
858 | | // saved_aux_size was; otherwise we would reinit the saved_aux repeatedly when switching between |
859 | | // noop and a real minimizer. We don't want that, because high-level frameworks use the noop minimizer to |
860 | | // make sure some model parameters don't update when we don't want them to. |
861 | 67 | int old_saved_aux_size; |
862 | 67 | if (old_minimizer.cmd == CCV_NNC_NOOP) |
863 | 67 | { |
864 | 67 | int input_size; |
865 | 67 | ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0); |
866 | 67 | if (input_size < 2) // This is not legit. |
867 | 0 | old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer); |
868 | 67 | else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters. |
869 | 67 | old_saved_aux_size = input_size - 2; |
870 | 67 | } else |
871 | 0 | old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer); |
872 | 67 | if (old_saved_aux_size != saved_aux_size) |
873 | 65 | { |
874 | 65 | this_parameter_flag = 1; |
875 | 65 | if (saved_aux_size > old_saved_aux_size) |
876 | 65 | { |
877 | | // Allocate new tensor symbols. |
878 | 65 | const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]); |
879 | 189 | for (j = old_saved_aux_size; j < saved_aux_size; j++) |
880 | 124 | { |
881 | 124 | saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0); |
882 | 124 | saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0); |
883 | 124 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
884 | 460 | for (k = 1; k < parallel_count; k++) |
885 | 336 | { |
886 | 336 | ccv_nnc_tensor_param_t dev_info = info; |
887 | 336 | if (k != device_id) |
888 | 336 | CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k); |
889 | 0 | else |
890 | 0 | CCV_TENSOR_SET_DEVICE_ID(dev_info.type, 0); |
891 | 336 | const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0); |
892 | 336 | const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0); |
893 | 336 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy); |
894 | 336 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy); |
895 | 336 | } |
896 | 124 | } |
897 | 65 | } else { |
898 | 0 | for (j = saved_aux_size; j < old_saved_aux_size; j++) |
899 | 0 | { |
900 | 0 | for (k = 1; k < parallel_count; k++) |
901 | 0 | { |
902 | 0 | const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k); |
903 | 0 | if (src_copy.d >= 0) |
904 | 0 | { |
905 | 0 | ccv_nnc_tensor_symbol_free(graph, src_copy); |
906 | 0 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL); |
907 | 0 | } |
908 | 0 | const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k); |
909 | 0 | if (dest_copy.d >= 0) |
910 | 0 | { |
911 | 0 | ccv_nnc_tensor_symbol_free(graph, dest_copy); |
912 | 0 | ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL); |
913 | 0 | } |
914 | 0 | } |
915 | 0 | ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source); |
916 | 0 | ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination); |
917 | 0 | saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL; |
918 | 0 | } |
919 | 0 | } |
920 | 65 | } |
921 | 67 | } |
922 | 20.0k | _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer); |
923 | 20.0k | if (this_parameter_flag) |
924 | 65 | { |
925 | 65 | ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2]; |
926 | 65 | ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1]; |
927 | 65 | const int* inputs = 0; |
928 | 65 | int input_size = 0; |
929 | 65 | ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0); |
930 | 65 | assert(input_size >= 1); |
931 | 65 | update_inputs[0].d = inputs[0]; |
932 | 65 | update_inputs[0].graph = graph; |
933 | 65 | update_inputs[1].d = inputs[1]; |
934 | 65 | update_inputs[1].graph = graph; |
935 | 65 | update_outputs[0] = updated_parameters[parameter_indice]; |
936 | 189 | for (j = 0; j < saved_aux_size; j++) |
937 | 124 | { |
938 | 124 | update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source; |
939 | 124 | update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination; |
940 | 124 | } |
941 | 65 | ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1); |
942 | 233 | for (k = 1; k < parallel_count; k++) |
943 | 168 | { |
944 | 168 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k); |
945 | 168 | assert(copy.d >= 0); |
946 | 168 | ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0); |
947 | 168 | assert(input_size >= 1); |
948 | 168 | update_inputs[0].d = inputs[0]; |
949 | 168 | update_inputs[0].graph = graph; |
950 | 168 | update_inputs[1].d = inputs[1]; |
951 | 168 | update_inputs[1].graph = graph; |
952 | 168 | update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k); |
953 | 504 | for (j = 0; j < saved_aux_size; j++) |
954 | 336 | { |
955 | 336 | update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k); |
956 | 336 | update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k); |
957 | 336 | } |
958 | 168 | ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1); |
959 | 168 | } |
960 | 65 | } |
961 | 20.0k | return this_parameter_flag; |
962 | 20.0k | } |
963 | | |
964 | | typedef struct { |
965 | | int parameter_size; |
966 | | ccv_nnc_cmd_t minimizer; |
967 | | ccv_cnnp_model_io_t parameters[1]; |
968 | | } ccv_cnnp_set_minimizer_for_parameter_t; |
969 | | |
970 | | static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model) |
971 | 296 | { |
972 | 296 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
973 | 296 | assert(compiled_data); |
974 | 296 | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
975 | | // We update all parameters; at this point, we have one minimizer. |
976 | 296 | const int parameter_size = compiled_data->parameters->rnum; |
977 | 296 | ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes; |
978 | 296 | ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph; |
979 | 296 | assert(symbolic_graph); |
980 | 296 | const int parallel_count = ccv_max(model->parallel_count, 1); |
981 | 296 | ccv_array_t* const parameters = compiled_data->minimize.parameters; |
982 | 296 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
983 | 296 | int i, j, flag = 0; |
984 | 301 | for (i = 0; i < parameters->rnum; i++5 ) |
985 | 5 | { |
986 | 5 | ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i); |
987 | 10 | for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++5 ) |
988 | 5 | { |
989 | 5 | const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 13 : set_minimizer_for_parameter->parameters[j]->param_sel2 ; |
990 | 5 | assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0); |
991 | 5 | const int old_rnum = parameter_indices->rnum; |
992 | 5 | ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices); |
993 | 5 | const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 10 : set_minimizer_for_parameter->parameters[j]->param_ref; |
994 | 5 | assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0); |
995 | 5 | if (param_ref >= 0) |
996 | 0 | { |
997 | 0 | assert(param_ref + old_rnum < parameter_indices->rnum); |
998 | 0 | *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum); |
999 | 0 | parameter_indices->rnum = old_rnum + 1; |
1000 | 0 | } |
1001 | 5 | } |
1002 | 5 | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer); |
1003 | | // We may have duplicated indices, but that is OK; we will just set it twice. |
1004 | 58 | for (j = 0; j < parameter_indices->rnum; j++53 ) |
1005 | 53 | { |
1006 | 53 | const int d = *(int*)ccv_array_get(parameter_indices, j); |
1007 | 53 | assert(d <= parameter_size); |
1008 | 53 | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d)) |
1009 | 0 | flag = 1; |
1010 | 53 | } |
1011 | 5 | ccv_array_clear(parameter_indices); |
1012 | 5 | } |
1013 | 296 | ccv_array_free(parameter_indices); |
1014 | 296 | return flag; |
1015 | 296 | } |
1016 | | |
1017 | | static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size) |
1018 | 2.25k | { |
1019 | 2.25k | if (new_saved_aux_size == old_saved_aux_size) |
1020 | 2.24k | return; |
1021 | 2.25k | assert(new_saved_aux_size > old_saved_aux_size)7 ; |
1022 | 7 | int i, j; |
1023 | 72 | for (i = parameter_size - 1; i >= 0; i--65 ) |
1024 | 65 | { |
1025 | 189 | for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; j--124 ) |
1026 | 124 | saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL; |
1027 | 65 | for (j = old_saved_aux_size - 1; j >= 0; j--0 ) |
1028 | 0 | saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j]; |
1029 | 65 | } |
1030 | 7 | } |
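
A minimal standalone sketch (illustrative only, not library code) of the in-place re-striding that _ccv_cnnp_scatter_saved_aux performs: entries packed with stride old_saved_aux_size are spread out to stride new_saved_aux_size, iterating from the highest index downward so nothing is overwritten before it is read, and the newly exposed slots are cleared.

#include <stdio.h>

#define EMPTY (-1)

// Re-stride a packed int array in place; mirrors the loop order above.
// Assumes new_size >= old_size and a buffer of parameter_size * new_size ints.
static void scatter(int* const a, const int parameter_size, const int old_size, const int new_size)
{
	int i, j;
	if (new_size == old_size)
		return;
	for (i = parameter_size - 1; i >= 0; i--)
	{
		for (j = new_size - 1; j >= old_size; j--)
			a[i * new_size + j] = EMPTY;
		for (j = old_size - 1; j >= 0; j--)
			a[i * new_size + j] = a[i * old_size + j];
	}
}

int main(void)
{
	int a[6] = { 10, 20, 30, 0, 0, 0 }; // 3 parameters, 1 aux slot each
	scatter(a, 3, 1, 2); // widen to 2 aux slots per parameter
	int i;
	for (i = 0; i < 6; i++)
		printf("%d ", a[i]); // prints: 10 -1 20 -1 30 -1
	printf("\n");
	return 0;
}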
1031 | | |
1032 | | static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model) |
1033 | 44 | { |
1034 | 44 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1035 | 44 | assert(compiled_data); |
1036 | 44 | if (!compiled_data->rewindables) |
1037 | 44 | compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0); |
1038 | 44 | ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables, 0); |
1039 | 44 | ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables, 0); |
1040 | 44 | ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables, 0); |
1041 | 44 | } |
1042 | | |
1043 | | static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size) |
1044 | 2.24k | { |
1045 | 2.24k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1046 | 2.24k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE); |
1047 | 2.24k | assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE); |
1048 | 2.24k | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1049 | 2.24k | assert(evaluate_to_size > 0); |
1050 | 2.24k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1051 | 2.24k | compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count); |
1052 | 2.24k | compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count); |
1053 | 2.24k | int i, j; |
1054 | 2.24k | const int output_size = model->output_size; |
1055 | 2.24k | assert(!fits || fit_size == output_size * parallel_count); |
1056 | 2.24k | if (fits) |
1057 | 12 | for (i = 0; 6 i < output_size; i++6 ) |
1058 | 6 | ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info); |
1059 | 2.24k | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
1060 | 2.24k | const int parameter_size = compiled_data->parameters->rnum; |
1061 | 2.24k | compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size); |
1062 | 2.24k | compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size); |
1063 | 2.24k | compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size); |
1064 | 2.24k | int parameter_size_maybe_more = parameter_size; |
1065 | 2.24k | compiled_data->disable_outgrad = disable_outgrad; |
1066 | 2.24k | int outgrad_size; |
1067 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 02.23k ) |
1068 | 9 | outgrad_size = 0; |
1069 | 2.23k | else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs. |
1070 | 2.23k | outgrad_size = model->input_size; |
1071 | 3 | else { |
1072 | 3 | assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this. |
1073 | 3 | outgrad_size = 0; |
1074 | 10 | for (i = 0; i < model->input_size; i++7 ) |
1075 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1076 | 3 | ++outgrad_size; |
1077 | 3 | } |
1078 | 2.24k | compiled_data->outgrad_size = outgrad_size; |
1079 | 2.24k | parameter_size_maybe_more += outgrad_size; |
1080 | 2.24k | compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count); |
1081 | 2.24k | compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? compiled_data->gradients + parameter_size2.23k : 09 ; |
1082 | 2.24k | compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more); |
1083 | 2.24k | compiled_data->backward.to_size = parameter_size_maybe_more; |
1084 | 2.24k | ccv_nnc_tensor_symbol_t* parameters = (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0); |
1085 | 2.24k | if (compiled_data->parameter_flags) |
1086 | 4 | { |
1087 | 4 | parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size); |
1088 | 25 | for (i = 0; i < parameter_size; i++21 ) |
1089 | 21 | if (compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63))) |
1090 | 14 | parameters[i] = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1091 | 7 | else |
1092 | 7 | parameters[i] = NO_TENSOR_SYMBOL; |
1093 | 4 | } |
1094 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 02.23k ) |
1095 | 9 | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1096 | 2.23k | else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs. |
1097 | 2.23k | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1098 | 3 | else { // Compute minimize with gradients including selected inputs. |
1099 | 3 | assert(model->input_size > 0); |
1100 | 3 | assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this. |
1101 | 3 | assert(outgrad_size > 0); |
1102 | 3 | ccv_nnc_tensor_symbol_t outgrads[outgrad_size]; |
1103 | 3 | j = 0; |
1104 | 10 | for (i = 0; i < model->input_size; i++7 ) |
1105 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1106 | 3 | outgrads[j++] = model->inputs[i]; |
1107 | 3 | ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, parameters, parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes); |
1108 | 3 | } |
1109 | 2.24k | if (compiled_data->parameter_flags) |
1110 | 4 | ccfree(parameters); |
1111 | 2.24k | _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size); |
1112 | 2.24k | if (compiled_data->minimize.parameters) |
1113 | 5 | _ccv_cnnp_apply_parameters_with_minimizer(model); |
1114 | | // Go through gradient checkpoints to generate tensor inputs for the backward pass just before executing it. |
1115 | 2.24k | ccv_cnnp_model_apply_gradient_checkpoints(compiled_data, model->graph); |
1116 | 4.48k | for (i = 0; i < output_size; i++2.24k ) |
1117 | 2.24k | { |
1118 | 2.24k | const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]); |
1119 | | // Init this to 1 so we can backprop. |
1120 | 2.24k | ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES); |
1121 | 2.24k | } |
1122 | 2.24k | compiled_data->backward.to_size = 0; |
1123 | 7.15k | for (i = 0; i < parameter_size_maybe_more; i++4.91k ) |
1124 | 4.91k | if (compiled_data->gradients[i].d != CCV_NNC_NO_TENSOR_SYMBOL) |
1125 | 4.90k | compiled_data->backward.tos[compiled_data->backward.to_size++] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]); |
1126 | 2.24k | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS); |
1127 | 2.24k | ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size); |
1128 | 4.49k | for (i = 0; i < parameter_size_maybe_more - parameter_size; i++2.25k ) |
1129 | 2.25k | { |
1130 | 0 | if (compiled_data->outgrads[i].d < 0) // When we go through inputs, we might find zero-length inputs, and for these, we cannot have any outgrads. |
1131 | 0 | continue; |
1132 | 2.25k | const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]); |
1133 | 2.25k | const int* tos; |
1134 | 2.25k | int to_size; |
1135 | 2.25k | ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size); |
1136 | 2.25k | if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination. Otherwise it is covered in update_nodes. |
1137 | 9 | { |
1138 | 9 | const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph); |
1139 | 9 | const int destination_count = ccv_nnc_symbolic_graph_destination_size(model->graph); |
1140 | 9 | int flag = 0; |
1141 | 9 | const int outgrad_destination_start = ccv_max(0, destination_count - i); |
1142 | 11 | for (j = i - 1; !flag && j >= 09 ; j--2 ) |
1143 | 2 | if (j + outgrad_destination_start < destination_count) |
1144 | 2 | flag = (destinations[j + outgrad_destination_start].d == outgrad.d); |
1145 | 9 | if (!flag) // Only add it if we cannot find it. |
1146 | 7 | ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad); |
1147 | 9 | } |
1148 | 2.25k | } |
1149 | 2.24k | if (parallel_count > 1) |
1150 | 8 | { |
1151 | 8 | ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count, |
1152 | 8 | 0, 0, |
1153 | 8 | compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */, |
1154 | 8 | compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */, |
1155 | 8 | 0, 0, 0, |
1156 | 8 | CCV_NNC_PARALLEL_REDUCE_OP_SUM, |
1157 | 8 | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1158 | 8 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1159 | 16 | for (i = 0; i < evaluate_to_size; i++8 ) |
1160 | 32 | for (j = 1; 8 j < parallel_count; j++24 ) |
1161 | 24 | { |
1162 | 24 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j); |
1163 | 24 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1164 | 24 | compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy; |
1165 | 24 | } |
1166 | 8 | const int backward_to_size = compiled_data->backward.to_size; |
1167 | 146 | for (i = 0; i < backward_to_size; i++138 ) |
1168 | 552 | for (j = 1; 138 j < parallel_count; j++414 ) |
1169 | 414 | { |
1170 | 414 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j); |
1171 | 414 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1172 | 414 | compiled_data->backward.tos[compiled_data->backward.to_size++] = copy; |
1173 | 414 | } |
1174 | 8 | } |
1175 | | // Only use memory compression if we are in gradient parameter mode. |
1176 | 2.24k | if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS2.23k ) |
1177 | 2.24k | { |
1178 | 2.24k | if (model->memory_compression) |
1179 | 0 | ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1180 | 2.24k | if (model->memory_reduction) |
1181 | 0 | ccv_nnc_symbolic_graph_memory_reduction(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1182 | 2.24k | } |
1183 | 2.24k | compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size); |
1184 | 2.24k | compiled_data->gradient_mode = gradient_mode; |
1185 | 2.24k | } |
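
The gradient init above carves three logical arrays (updated_parameters, update_nodes, saved_aux) out of a single ccmalloc block and releases them later with one ccfree. A minimal standalone sketch of that packing pattern, using plain malloc and illustrative stand-in types rather than the library's symbol structs:

#include <stdlib.h>

typedef struct { int d; } sym_t;  // stand-in for ccv_nnc_tensor_symbol_t
typedef struct { int d; } exec_t; // stand-in for ccv_nnc_graph_exec_symbol_t
typedef struct { sym_t source, destination; } sym_map_t; // stand-in for the symbol map

int main(void)
{
	const int parameter_size = 4, max_saved_aux_size = 2;
	// One allocation, three arrays laid out back to back.
	sym_t* const updated = malloc(sizeof(sym_t) * parameter_size
		+ sizeof(exec_t) * parameter_size
		+ sizeof(sym_map_t) * max_saved_aux_size * parameter_size);
	exec_t* const update_nodes = (exec_t*)(updated + parameter_size);
	sym_map_t* const saved_aux = (sym_map_t*)(update_nodes + parameter_size);
	// ... populate updated / update_nodes / saved_aux here ...
	(void)saved_aux;
	free(updated); // one free releases all three arrays
	return 0;
}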
1186 | | |
1187 | | void ccv_cnnp_model_tensors_init_0(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1188 | 94 | { |
1189 | 94 | assert(!compiled_data->tensors.parameters); |
1190 | 94 | const int parameter_size = compiled_data->parameters->rnum; |
1191 | 94 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1192 | 94 | const int internal_size = compiled_data->internals->rnum; |
1193 | 94 | compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph); |
1194 | 94 | compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t)); |
1195 | 94 | compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)cccalloc((parameter_size + internal_size) * parallel_count, sizeof(ccv_nnc_tensor_t*)); |
1196 | 94 | compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count; |
1197 | 94 | } |
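
tensors_init.v allocated above is a plain uint32_t bitset with one bit per tensor symbol, hence the (size + 31) >> 5 word count. A minimal standalone sketch of the test/set idiom used later in ccv_cnnp_model_tensors_init_1 and _ccv_cnnp_model_copy_tensors (illustrative helpers, not library functions):

#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

static uint32_t* bitset_new(const int size)
{
	return calloc((size + 31) >> 5, sizeof(uint32_t)); // one bit per symbol
}

static void bitset_set(uint32_t* const v, const int d)
{
	v[d >> 5] |= (1u << (d & 0x1f));
}

static int bitset_test(const uint32_t* const v, const int d)
{
	return !!(v[d >> 5] & (1u << (d & 0x1f)));
}

int main(void)
{
	uint32_t* const v = bitset_new(100);
	bitset_set(v, 37);
	assert(bitset_test(v, 37) && !bitset_test(v, 38));
	free(v);
	return 0;
}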
1198 | | |
1199 | | int ccv_cnnp_model_tensors_any_to_alloc(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1200 | 3 | { |
1201 | 3 | int i, j; |
1202 | 3 | const int parameter_size = compiled_data->parameters->rnum; |
1203 | 3 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1204 | 3 | const int internal_size = compiled_data->internals->rnum; |
1205 | 19 | for (i = 0; i < parameter_size; i++16 ) |
1206 | 16 | { |
1207 | | // Parameters have to be allocated all together. |
1208 | 16 | if (compiled_data->tensors.parameters[i]) |
1209 | 16 | { |
1210 | 16 | for (j = 1; j < parallel_count; j++0 ) |
1211 | 0 | { assert(compiled_data->tensors.parameters[i + j * parameter_size]); } |
1212 | 16 | continue; |
1213 | 16 | } |
1214 | 0 | return 1; |
1215 | 16 | } |
1216 | 3 | for (i = 0; i < internal_size; i++0 ) |
1217 | 0 | { |
1218 | 0 | if (!compiled_data->tensors.internals[i]) |
1219 | 0 | return 1; |
1220 | 0 | for (j = 1; j < parallel_count; j++) |
1221 | 0 | if (!compiled_data->tensors.internals[i + j * internal_size]) |
1222 | 0 | return 1; |
1223 | 0 | } |
1224 | 3 | return 0; |
1225 | 3 | } |
1226 | | |
1227 | | void ccv_cnnp_model_tensors_init_1(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1228 | 91 | { |
1229 | 91 | int i, j; |
1230 | 91 | const int parameter_size = compiled_data->parameters->rnum; |
1231 | 91 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1232 | 91 | const int internal_size = compiled_data->internals->rnum; |
1233 | 379 | for (i = 0; i < parameter_size; i++288 ) |
1234 | 288 | { |
1235 | | // Parameters have to be allocated all together. |
1236 | 288 | if (compiled_data->tensors.parameters[i]) |
1237 | 0 | { |
1238 | 0 | for (j = 1; j < parallel_count; j++) |
1239 | 0 | { assert(compiled_data->tensors.parameters[i + j * parameter_size]); } |
1240 | 0 | continue; |
1241 | 0 | } |
1242 | 288 | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1243 | 288 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter); |
1244 | 288 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1245 | 104 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1246 | 288 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1247 | 288 | compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0); |
1248 | 690 | for (j = 1; j < parallel_count; j++402 ) |
1249 | 402 | { |
1250 | 402 | if (j != device_id) |
1251 | 402 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1252 | 0 | else |
1253 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1254 | 402 | compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1255 | 402 | } |
1256 | 288 | } |
1257 | 91 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1258 | 153 | for (i = 0; i < internal_size; i++62 ) |
1259 | 62 | { |
1260 | 62 | const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i); |
1261 | 62 | const int d = retained.d; |
1262 | 62 | if (init_v[d >> 5] & (1u << (d & 0x1f))) |
1263 | 0 | continue; |
1264 | 62 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained); |
1265 | 62 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1266 | 7 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1267 | 62 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1268 | 62 | if (!compiled_data->tensors.internals[i]) |
1269 | 62 | compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0); |
1270 | 158 | for (j = 1; j < parallel_count; j++96 ) |
1271 | 96 | { |
1272 | 96 | if (j != device_id) |
1273 | 96 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1274 | 0 | else |
1275 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1276 | 96 | if (!compiled_data->tensors.internals[i + j * internal_size]) |
1277 | 96 | compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0); |
1278 | 96 | } |
1279 | 62 | } |
1280 | 91 | compiled_data->tensors_init.v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); // Remove 1 if any. |
1281 | 91 | } |
1282 | | |
1283 | | static void _ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1284 | 91 | { |
1285 | 91 | ccv_cnnp_model_tensors_init_0(model, compiled_data); |
1286 | 91 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1287 | 91 | } |
1288 | | |
1289 | | static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count) |
1290 | 6 | { |
1291 | 6 | assert(parallel_count > 0); |
1292 | 6 | int i, j; |
1293 | 12 | for (i = 0; i < tensor_size; i++6 ) |
1294 | 6 | { |
1295 | 6 | if (!tensors[i]) |
1296 | 0 | continue; |
1297 | 6 | const int d = tensor_symbols[i].d; |
1298 | 6 | if (!(tensors_init[d >> 5] & (1u << (d & 0x1f)))) |
1299 | 0 | continue; |
1300 | 24 | for (j = 1; 6 j < parallel_count; j++18 ) |
1301 | 18 | if (tensors[i + j * tensor_size]) |
1302 | 18 | { |
1303 | 18 | ccv_nnc_tensor_t* const input = CCV_NNC_TENSOR(tensors[i]); |
1304 | 18 | ccv_nnc_tensor_t* const output = CCV_NNC_TENSOR(tensors[i + j * tensor_size]); |
1305 | 18 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &input, 1, &output, 1, 0); |
1306 | 18 | } |
1307 | 6 | } |
1308 | 6 | } |
1309 | | |
1310 | | static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count) |
1311 | 99 | { |
1312 | 99 | assert(parallel_count > 0); |
1313 | 99 | int i, j; |
1314 | 162 | for (i = 0; i < tensor_size; i++63 ) |
1315 | 63 | { |
1316 | 63 | const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1317 | 159 | for (j = 1; j < parallel_count; j++96 ) |
1318 | 96 | { |
1319 | 96 | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1320 | 96 | ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size]; |
1321 | 96 | if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1322 | 0 | { // We shouldn't have allocated this; free it up. |
1323 | 0 | ccv_nnc_tensor_free(tensors[i + j * tensor_size]); |
1324 | 0 | tensors[i + j * tensor_size] = 0; |
1325 | 0 | } |
1326 | 96 | } |
1327 | 63 | } |
1328 | 99 | } |
1329 | | |
1330 | | static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds) |
1331 | 531 | { |
1332 | 531 | assert(parallel_count > 0); |
1333 | 531 | int i, j; |
1334 | 1.91k | for (i = 0; i < tensor_size; i++1.38k ) |
1335 | 1.38k | { |
1336 | 1.38k | ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1337 | 1.38k | if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1338 | 7 | continue; |
1339 | 1.37k | if (graph) |
1340 | 1.37k | { |
1341 | 1.37k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol); |
1342 | 1.37k | if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1343 | 0 | tensor_symbol = alias_to; |
1344 | 1.37k | } |
1345 | 1.37k | ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(tensors[i]); |
1346 | 1.37k | if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL1.37k ) |
1347 | 1.37k | { |
1348 | 1.37k | const ccv_nnc_tensor_bind_t retained_bind = { |
1349 | 1.37k | .symbol = tensor_symbol, |
1350 | 1.37k | .tensor = tensor |
1351 | 1.37k | }; |
1352 | 1.37k | ccv_array_push(tensor_binds, &retained_bind); |
1353 | 1.37k | } |
1354 | 2.92k | for (j = 1; j < parallel_count; j++1.54k ) |
1355 | 1.54k | { |
1356 | 1.54k | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1357 | 1.54k | ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size]; |
1358 | 1.54k | if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1359 | 1.54k | { |
1360 | 1.54k | const ccv_nnc_tensor_bind_t bind = { |
1361 | 1.54k | .symbol = copy, |
1362 | 1.54k | .tensor = tensors[i + j * tensor_size] |
1363 | 1.54k | }; |
1364 | 1.54k | ccv_array_push(tensor_binds, &bind); |
1365 | 1.54k | } |
1366 | 1.54k | } |
1367 | 1.37k | } |
1368 | 531 | } |
1369 | | |
1370 | | static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1371 | 2.40k | { |
1372 | 2.40k | if (compiled_data->graph) |
1373 | 99 | ccv_nnc_graph_free(compiled_data->graph); |
1374 | 2.40k | compiled_data->graph = 0; |
1375 | 2.40k | compiled_data->is_test = 0; |
1376 | 2.40k | if (compiled_data->tensor_arena) |
1377 | 99 | ccv_nnc_tensor_arena_free(compiled_data->tensor_arena); |
1378 | 2.40k | compiled_data->tensor_arena = 0; |
1379 | 2.40k | if (compiled_data->graph_exec_arena) |
1380 | 99 | ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena); |
1381 | 2.40k | compiled_data->graph_exec_arena = 0; |
1382 | 2.40k | if (compiled_data->backward.from_ops) |
1383 | 32 | ccfree(compiled_data->backward.from_ops); |
1384 | 2.40k | compiled_data->backward.from_ops = 0; |
1385 | 2.40k | if (compiled_data->evaluate.schedule) |
1386 | 37 | ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule); |
1387 | 2.40k | compiled_data->evaluate.schedule = 0; |
1388 | 2.40k | if (compiled_data->backward.schedule) |
1389 | 28 | ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule); |
1390 | 2.40k | compiled_data->backward.schedule = 0; |
1391 | 2.40k | } |
1392 | | |
1393 | | static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1394 | 2.30k | { |
1395 | 2.30k | if (compiled_data->gradients) |
1396 | 2.24k | ccfree(compiled_data->gradients); |
1397 | 2.30k | compiled_data->gradients = 0; |
1398 | 2.30k | if (compiled_data->updated_parameters) |
1399 | 2.24k | ccfree(compiled_data->updated_parameters); |
1400 | 2.30k | compiled_data->updated_parameters = 0; |
1401 | 2.30k | compiled_data->update_nodes = 0; |
1402 | 2.30k | compiled_data->saved_aux = 0; |
1403 | 2.30k | } |
1404 | | |
1405 | | static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1406 | 2.34k | { |
1407 | 2.34k | if (compiled_data->backward.gradients) |
1408 | 5 | ccfree(compiled_data->backward.gradients); |
1409 | 2.34k | compiled_data->backward.gradients = 0; |
1410 | 2.34k | if (compiled_data->backward.accum) |
1411 | 5 | ccv_nnc_graph_free(compiled_data->backward.accum); |
1412 | 2.34k | compiled_data->backward.accum = 0; |
1413 | 2.34k | if (compiled_data->backward.tensor_arena) |
1414 | 5 | ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena); |
1415 | 2.34k | compiled_data->backward.tensor_arena = 0; |
1416 | 2.34k | if (compiled_data->backward.graph_exec_arena) |
1417 | 5 | ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena); |
1418 | 2.34k | compiled_data->backward.graph_exec_arena = 0; |
1419 | 2.34k | } |
1420 | | |
1421 | | static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data) |
1422 | 2.31k | { |
1423 | 2.31k | if (compiled_data->apply_gradients.graph) |
1424 | 24 | ccv_nnc_graph_free(compiled_data->apply_gradients.graph); |
1425 | 2.31k | compiled_data->apply_gradients.graph = 0; |
1426 | 2.31k | if (compiled_data->apply_gradients.tensor_arena) |
1427 | 24 | ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena); |
1428 | 2.31k | compiled_data->apply_gradients.tensor_arena = 0; |
1429 | 2.31k | if (compiled_data->apply_gradients.graph_exec_arena) |
1430 | 24 | ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena); |
1431 | 2.31k | compiled_data->apply_gradients.graph_exec_arena = 0; |
1432 | 2.31k | } |
1433 | | |
1434 | | // Compile the graph to run ccv_cnnp_model_fit |
1435 | | static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1436 | 8 | { |
1437 | 8 | int i, j; |
1438 | 8 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1439 | 8 | assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE); |
1440 | 8 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE; |
1441 | 8 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1442 | 8 | assert(output_size == model->output_size * parallel_count); |
1443 | 8 | assert(!fits || output_size == fit_size); |
1444 | 8 | assert(output_size > 0); |
1445 | 8 | if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
1446 | 8 | { |
1447 | 8 | _ccv_cnnp_model_set_rewindables(model); |
1448 | 8 | _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size); |
1449 | 8 | } else if (0 compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES0 ) { |
1450 | 0 | _ccv_cnnp_model_rewind_graph(model); |
1451 | 0 | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
1452 | 0 | compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE; |
1453 | 0 | _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size); |
1454 | 0 | } |
1455 | 8 | const int tensors_init = !!compiled_data->tensors_init.v; |
1456 | 8 | if (!tensors_init) |
1457 | 4 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1458 | 4 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1459 | | // Check if it is not fully allocated; if not, run init_1. |
1460 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1461 | 8 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1462 | 8 | assert((input_size % parallel_count) == 0); |
1463 | 8 | assert((output_size % parallel_count) == 0); |
1464 | 8 | assert((fit_size % parallel_count) == 0); |
1465 | 8 | const int input_size_per_p = input_size / parallel_count; |
1466 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1467 | 8 | const int output_size_per_p = output_size / parallel_count; |
1468 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1469 | 8 | const int fit_size_per_p = fit_size / parallel_count; |
1470 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds); |
1471 | 8 | const int parameter_size = compiled_data->parameters->rnum; |
1472 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1473 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1474 | 8 | const int internal_size = compiled_data->internals->rnum; |
1475 | 8 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1476 | 8 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1477 | 8 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1478 | 8 | ccv_array_free(tensor_binds); |
1479 | 8 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1480 | 8 | if (tensors_init && parallel_count > 14 ) |
1481 | 0 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1482 | | // If tensors are not init'ed, we need to init states first. |
1483 | 8 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1484 | 7 | { |
1485 | 7 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1486 | 7 | .parallel_count = parallel_count, |
1487 | 7 | .graph = model->graph, |
1488 | 7 | .compiled_data = compiled_data, |
1489 | 7 | .tensor_arena = compiled_data->tensor_arena |
1490 | 7 | }; |
1491 | 7 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1492 | 7 | } |
1493 | 8 | compiled_data->is_test = 0; |
1494 | 8 | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer); |
1495 | | // No need to set because it defaults to training mode. |
1496 | | // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update); |
1497 | 105 | for (i = 0; i < saved_aux_size * parameter_size; i++97 ) |
1498 | 97 | { |
1499 | 97 | if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1500 | 5 | continue; |
1501 | 92 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source); |
1502 | 92 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0); |
1503 | 296 | for (j = 1; j < parallel_count; j++204 ) |
1504 | 204 | { |
1505 | 204 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j)); |
1506 | 204 | if (copy) |
1507 | 204 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, ©, 1, 0); |
1508 | 204 | } |
1509 | 92 | } |
1510 | 8 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1511 | 8 | compiled_data->evaluate.to_op_size = 0; |
1512 | 22 | for (i = 0; i < evaluate_to_size; i++14 ) |
1513 | 14 | { |
1514 | 14 | ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]); |
1515 | 14 | if (to.graph) |
1516 | 14 | compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to; |
1517 | 14 | } |
1518 | 8 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1519 | 8 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1520 | 8 | } |
1521 | | |
1522 | | ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model) |
1523 | 0 | { |
1524 | 0 | const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1525 | 0 | if (!compiled_data || !compiled_data->graph) |
1526 | 0 | return 0; |
1527 | 0 | return ccv_nnc_graph_default_stream(compiled_data->graph); |
1528 | 0 | } |
1529 | | |
1530 | | uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model) |
1531 | 0 | { |
1532 | 0 | const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1533 | 0 | if (!compiled_data || !compiled_data->tensor_arena) |
1534 | 0 | return 0; |
1535 | 0 | return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena); |
1536 | 0 | } |
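
A hedged usage sketch for the two introspection calls above. It assumes a ccv_cnnp_model_t* that has already been compiled and run (so the graph and tensor arena exist; otherwise both calls return 0), and that the Level-5 declarations come from ccv_nnc.h as in this file's includes:

#include <stdio.h>
#include <inttypes.h>
#include "ccv_nnc.h"

// model: assumed to be a compiled ccv_cnnp_model_t* obtained elsewhere.
static void report_model(ccv_cnnp_model_t* const model)
{
	const uint64_t bytes = ccv_cnnp_model_memory_size(model);
	ccv_nnc_stream_context_t* const stream = ccv_cnnp_model_default_stream(model);
	printf("tensor arena: %" PRIu64 " bytes, default stream: %p\n", bytes, (void*)stream);
}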
1537 | | |
1538 | | static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count) |
1539 | 38.9k | { |
1540 | 38.9k | int i, j; |
1541 | 114k | for (i = 0; i < tensor_size; i++75.6k ) |
1542 | 75.6k | { |
1543 | 75.6k | ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i]; |
1544 | 75.6k | if (tensor_symbol.d == CCV_NNC_NO_TENSOR_SYMBOL) |
1545 | 0 | continue; |
1546 | 75.6k | if (graph) |
1547 | 72.7k | { |
1548 | 72.7k | const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol); |
1549 | 72.7k | if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1550 | 0 | tensor_symbol = alias_to; |
1551 | 72.7k | } |
1552 | 75.6k | ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]); |
1553 | 77.4k | for (j = 1; j < parallel_count; j++1.77k ) |
1554 | 1.77k | { |
1555 | 1.77k | const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j); |
1556 | 1.77k | if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL) |
1557 | 1.77k | ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]); |
1558 | 1.77k | } |
1559 | 75.6k | } |
1560 | 38.9k | } |
1561 | | |
1562 | | void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1563 | 2.54k | { |
1564 | 2.54k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1565 | 2.54k | assert(compiled_data); |
1566 | 2.54k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1567 | 2.54k | assert(output_size == model->output_size * parallel_count); |
1568 | 2.54k | assert(input_size == model->input_size * parallel_count); |
1569 | 2.54k | assert(!fits || fit_size == output_size); |
1570 | 2.54k | assert(model->graph); |
1571 | 2.54k | if (!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE2.53k ) |
1572 | 8 | { |
1573 | 8 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
1574 | 8 | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
1575 | 8 | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
1576 | | // Compile the symbolic graph down only when needed. |
1577 | 8 | _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size); |
1578 | 2.53k | } else { |
1579 | 2.53k | assert((input_size % parallel_count) == 0); |
1580 | 2.53k | assert((output_size % parallel_count) == 0); |
1581 | 2.53k | assert((fit_size % parallel_count) == 0); |
1582 | 2.53k | const int input_size_per_p = input_size / parallel_count; |
1583 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count); |
1584 | 2.53k | const int output_size_per_p = output_size / parallel_count; |
1585 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count); |
1586 | 2.53k | const int fit_size_per_p = fit_size / parallel_count; |
1587 | 2.53k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count); |
1588 | 2.53k | } |
1589 | 2.54k | if (compiled_data->is_test) |
1590 | 0 | { |
1591 | 0 | compiled_data->is_test = 0; |
1592 | 0 | ccv_nnc_graph_exec_update_t update = { |
1593 | 0 | .parallel_count = parallel_count, |
1594 | 0 | .graph = model->graph, |
1595 | 0 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1596 | 0 | }; |
1597 | 0 | ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update); |
1598 | 0 | } |
1599 | 2.54k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context); |
1600 | 2.54k | } |
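
A hedged sketch of the call shape ccv_cnnp_model_fit expects for a single-device model (parallel_count == 1) with one input, one fit and one output. The model is assumed to be compiled with a loss and a minimizer, and the tensors are assumed to be allocated elsewhere with shapes matching the model:

#include "ccv_nnc.h"

// model, input, fit and output are assumed to be created elsewhere.
static void fit_once(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const input, ccv_nnc_tensor_t* const fit, ccv_nnc_tensor_t* const output)
{
	ccv_nnc_tensor_t* const inputs[] = { input };   // input_size == model->input_size * parallel_count
	ccv_nnc_tensor_t* const fits[] = { fit };       // fit_size == output_size
	ccv_nnc_tensor_t* const outputs[] = { output }; // output_size == model->output_size * parallel_count
	ccv_cnnp_model_fit(model, inputs, 1, fits, 1, outputs, 1, 0, 0); // no tensor tape, default stream
}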
1601 | | |
1602 | | // Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = false (MULTISTAGE_MODE_NO_GRAD). |
1603 | | static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1604 | 59 | { |
1605 | 59 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1606 | 59 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD; |
1607 | 59 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1608 | 59 | assert(output_size == model->output_size * parallel_count); |
1609 | 59 | assert(output_size > 0); |
1610 | | // If the gradient is not initialized, continue to set up the parallel process. We don't init gradients here; rather, |
1611 | | // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel. |
1612 | 59 | if (parallel_count > 1 && compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE6 ) |
1613 | 6 | { |
1614 | 6 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1615 | 6 | compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count); |
1616 | 6 | _ccv_cnnp_model_set_rewindables(model); |
1617 | 6 | ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count, |
1618 | 6 | 0, 0, |
1619 | 6 | 0, 0, 0, |
1620 | 6 | 0, 0, 0, |
1621 | 6 | CCV_NNC_PARALLEL_REDUCE_OP_SUM, |
1622 | 6 | SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph)); |
1623 | 6 | ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1624 | 6 | int i, j; |
1625 | 12 | for (i = 0; i < evaluate_to_size; i++6 ) |
1626 | 24 | for (j = 1; 6 j < parallel_count; j++18 ) |
1627 | 18 | { |
1628 | 18 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j); |
1629 | 18 | if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL) |
1630 | 18 | compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy; |
1631 | 18 | } |
1632 | 6 | } |
1633 | 59 | const int tensors_init = !!compiled_data->tensors_init.v; |
1634 | 59 | if (!tensors_init) |
1635 | 35 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1636 | 24 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1637 | | // Check if it is not fully allocated; if not, run init_1. |
1638 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1639 | 59 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1640 | 59 | assert((input_size % parallel_count) == 0); |
1641 | 59 | assert((output_size % parallel_count) == 0); |
1642 | 59 | const int input_size_per_p = input_size / parallel_count; |
1643 | 59 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1644 | 59 | const int output_size_per_p = output_size / parallel_count; |
1645 | 59 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1646 | 59 | const int parameter_size = compiled_data->parameters->rnum; |
1647 | 59 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1648 | 59 | const int internal_size = compiled_data->internals->rnum; |
1649 | 59 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1650 | 59 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1651 | | // If we generated gradients for the graph, only compile part of the graph because the rest is irrelevant for evaluation. |
1652 | 59 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1653 | 59 | ccv_array_free(tensor_binds); |
1654 | 59 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1655 | | // If tensors are not init'ed, we need to init states first. |
1656 | 59 | if (tensors_init && parallel_count > 124 ) |
1657 | 6 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1658 | 59 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1659 | 17 | { |
1660 | 17 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1661 | 17 | .parallel_count = parallel_count, |
1662 | 17 | .graph = model->graph, |
1663 | 17 | .compiled_data = compiled_data, |
1664 | 17 | .tensor_arena = compiled_data->tensor_arena |
1665 | 17 | }; |
1666 | 17 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1667 | 17 | } |
1668 | 59 | compiled_data->is_test = 1; |
1669 | 59 | ccv_nnc_graph_exec_update_t update = { |
1670 | 59 | .parallel_count = parallel_count, |
1671 | 59 | .graph = model->graph, |
1672 | 59 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1673 | 59 | }; |
1674 | 59 | ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update); |
1675 | 59 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1676 | 59 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1677 | 59 | } |
1678 | | |
1679 | | static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
1680 | 31 | { |
1681 | 31 | assert(!compiled_data->tensors.gradients); |
1682 | 31 | const int parameter_size = compiled_data->parameters->rnum; |
1683 | 31 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1684 | 31 | compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count); |
1685 | 31 | compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count; |
1686 | 31 | int i, j; |
1687 | 181 | for (i = 0; i < parameter_size; i++150 ) |
1688 | 150 | { |
1689 | 150 | if (compiled_data->parameter_flags && !(compiled_data->parameter_flags[i >> 6] & ((uint64_t)1 << (i & 63)))6 ) |
1690 | 2 | { |
1691 | 2 | compiled_data->tensors.gradients[i] = 0; |
1692 | 2 | compiled_data->tensors.accum_gradients[i] = 0; |
1693 | 2 | for (j = 1; j < parallel_count; j++0 ) |
1694 | 0 | { |
1695 | 0 | compiled_data->tensors.gradients[i + j * parameter_size] = 0; |
1696 | 0 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0; |
1697 | 0 | } |
1698 | 2 | continue; |
1699 | 2 | } |
1700 | 148 | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
1701 | 148 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter); |
1702 | 148 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
1703 | 38 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1704 | 148 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
1705 | 148 | compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0); |
1706 | 148 | compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until we need it. |
1707 | 328 | for (j = 1; j < parallel_count; j++180 ) |
1708 | 180 | { |
1709 | 180 | if (j != device_id) |
1710 | 180 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
1711 | 0 | else |
1712 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
1713 | 180 | compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1714 | 180 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0; |
1715 | 180 | } |
1716 | 148 | } |
1717 | 31 | } |
1718 | | |
1719 | | static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size) |
1720 | 8.03k | { |
1721 | 8.03k | if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL) |
1722 | 15 | return 1; |
1723 | 8.02k | if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) |
1724 | 8.01k | return 0; |
1725 | 7 | int i; |
1726 | 7 | for (i = 0; i < input_size; i++0 ) |
1727 | 7 | if (!(disable_outgrad & ((uint64_t)1 << i))) |
1728 | 7 | return 0; |
1729 | 0 | return 1; |
1730 | 7 | } |
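
A minimal sketch of how a disable_outgrad mask for the helper above would be assembled: bit i set means input i is excluded from outgrads, and CCV_CNNP_DISABLE_OUTGRAD_NONE / _ALL are the sentinels checked first. The mask-building helper below is illustrative only, not a library call:

#include <stdint.h>

// Build a mask that disables outgrads for the listed input indices.
static uint64_t disable_outgrad_mask(const int* const indices, const int count)
{
	uint64_t mask = 0;
	int i;
	for (i = 0; i < count; i++)
		mask |= (uint64_t)1 << indices[i];
	return mask;
}

For example, disable_outgrad_mask((int[]){ 1, 2 }, 2) produces a mask that keeps outgrads only for input 0 of a three-input model.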
1731 | | |
1732 | | // Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE). |
1733 | | // Particularly, this method compiles the evaluation and backprop graph (the main graph). |
1734 | | static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1735 | 32 | { |
1736 | 32 | int i, j; |
1737 | 32 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1738 | 32 | const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES1 : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS31 ; |
1739 | 32 | assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode); |
1740 | 32 | compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE; |
1741 | 32 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1742 | 32 | assert(output_size == model->output_size * parallel_count); |
1743 | 32 | assert(output_size > 0); |
1744 | | // There shouldn't be a loss function if we evaluate with multistage jit. |
1745 | 32 | assert(compiled_data->loss.cmd == CCV_NNC_NOOP); |
1746 | 32 | if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE) |
1747 | 30 | { |
1748 | 30 | _ccv_cnnp_model_set_rewindables(model); |
1749 | 30 | _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here. |
1750 | 30 | } else if (2 compiled_data->gradient_mode != target_gradient_mode2 ) { |
1751 | 2 | _ccv_cnnp_model_rewind_graph(model); |
1752 | 2 | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
1753 | 2 | compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE; |
1754 | 2 | _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here. |
1755 | 2 | } |
1756 | 32 | const int tensors_init = !!compiled_data->tensors_init.v; |
1757 | 32 | if (!tensors_init) |
1758 | 24 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
1759 | 8 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
1760 | | // Check if it is not fully allocated; if not, run init_1. |
1761 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
1762 | 32 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1763 | 32 | assert((input_size % parallel_count) == 0); |
1764 | 32 | assert((output_size % parallel_count) == 0); |
1765 | 32 | const int input_size_per_p = input_size / parallel_count; |
1766 | 32 | _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds); |
1767 | 32 | const int output_size_per_p = output_size / parallel_count; |
1768 | 32 | _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds); |
1769 | 32 | const int parameter_size = compiled_data->parameters->rnum; |
1770 | 32 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
1771 | 32 | const int internal_size = compiled_data->internals->rnum; |
1772 | 32 | _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count); |
1773 | 32 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds); |
1774 | 32 | if (!compiled_data->tensors.gradients) |
1775 | 31 | _ccv_cnnp_model_gradient_tensors_init(model, compiled_data); |
1776 | 32 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds); |
1777 | 32 | if (compiled_data->backward.to_size > 0) |
1778 | 32 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1779 | 0 | else |
1780 | 0 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena); |
1781 | 32 | ccv_array_free(tensor_binds); |
1782 | 32 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
1783 | 32 | if (tensors_init && parallel_count > 18 ) |
1784 | 0 | _ccv_cnnp_model_copy_tensors(init_v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count); |
1785 | | // If tensors are not init'ed, we need to init states first. |
1786 | 32 | if (_ccv_cnnp_any_to_init(compiled_data)) |
1787 | 21 | { |
1788 | 21 | ccv_nnc_tensor_init_states_t tensor_init_states = { |
1789 | 21 | .parallel_count = parallel_count, |
1790 | 21 | .graph = model->graph, |
1791 | 21 | .compiled_data = compiled_data, |
1792 | 21 | .tensor_arena = compiled_data->tensor_arena |
1793 | 21 | }; |
1794 | 21 | ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states); |
1795 | 21 | } |
1796 | 32 | compiled_data->is_test = is_test; |
1797 | 32 | ccv_nnc_graph_exec_update_t update = { |
1798 | 32 | .parallel_count = parallel_count, |
1799 | 32 | .graph = model->graph, |
1800 | 32 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1801 | 32 | }; |
1802 | 32 | ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update); |
1803 | 32 | const int evaluate_to_size = compiled_data->evaluate.to_size; |
1804 | 32 | compiled_data->evaluate.to_op_size = 0; |
1805 | 32 | ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0); |
1806 | 82 | for (i = 0; i < evaluate_to_size; i++50 ) |
1807 | 50 | { |
1808 | 50 | ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]); |
1809 | 50 | if (to_op.graph) |
1810 | 50 | compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op; |
1811 | 50 | const int* tos; |
1812 | 50 | int to_size; |
1813 | 50 | ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size); |
1814 | 100 | for (j = 0; j < to_size; j++50 ) |
1815 | 50 | { |
1816 | 50 | ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){ |
1817 | 50 | .d = tos[j], |
1818 | 50 | .graph = model->graph |
1819 | 50 | }); |
1820 | 50 | if (to_op.graph) |
1821 | 50 | ccv_array_add_unique_int(backward_from, to_op.d); |
1822 | 50 | } |
1823 | 50 | } |
1824 | 32 | assert(backward_from->rnum > 0); |
1825 | 32 | compiled_data->backward.from_op_size = backward_from->rnum; |
1826 | 32 | compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum); |
1827 | 82 | for (i = 0; i < backward_from->rnum; i++50 ) |
1828 | 50 | compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){ |
1829 | 50 | .d = *(int*)ccv_array_get(backward_from, i), |
1830 | 50 | .graph = compiled_data->graph, |
1831 | 50 | }; |
1832 | | // If there are any set nodes (to set some tensors to 0) inserted through the backward pass, these won't be executed if we just do sources -> evaluate.to_ops, backward.from_ops -> destinations. We need this logic to find these nodes and explicitly add them to backward.from_ops.
1833 | 32 | ccv_nnc_graph_exec_info_t* const exec_info = (ccv_nnc_graph_exec_info_t*)ccv_array_get(compiled_data->graph->exec_info, 0); |
1834 | 32 | const int exec_info_size = compiled_data->graph->exec_info->rnum; |
1835 | 32 | uint32_t* const visited = cccalloc((exec_info_size + 31) >> 5, sizeof(uint32_t)); |
1836 | 32 | const ccv_nnc_graph_exec_t* const sources = (ccv_nnc_graph_exec_t*)ccv_array_get(compiled_data->graph->sources, 0); |
1837 | 32 | const int source_size = compiled_data->graph->sources->rnum; |
1838 | 64 | ccv_nnc_graph_visit_t* visit = ccv_nnc_graph_visit_new32 (compiled_data->graph, exec_info, exec_info_size, sources, source_size, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size, 0); |
1839 | 618 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1840 | 618 | visited[(idx >> 5)] |= (1u << (idx & 31)); |
1841 | 618 | } ccv_nnc_graph_visit_endfor |
1842 | 64 | ccv_nnc_graph_visit_free(visit); |
1843 | 64 | const ccv_nnc_graph_exec_t* const destinations = (ccv_nnc_graph_exec_t*)ccv_array_get32 (compiled_data->graph->destinations, 0); |
1844 | 64 | const int destination_size = compiled_data->graph->destinations->rnum; |
1845 | 64 | visit = ccv_nnc_graph_visit_new32 (compiled_data->graph, exec_info, exec_info_size, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, destinations, destination_size, 0); |
1846 | 675 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1847 | 675 | visited[(idx >> 5)] |= (1u << (idx & 31)); |
1848 | 675 | } ccv_nnc_graph_visit_endfor |
1849 | 64 | ccv_nnc_graph_visit_free(visit); |
1850 | 64 | visit = ccv_nnc_graph_visit_new32 (compiled_data->graph, exec_info, exec_info_size, sources, source_size, destinations, destination_size, 0); |
1851 | | // Find any missing nodes to be added as sources. Right now, these are only set nodes.
1852 | 1.34k | ccv_nnc_graph_visit_for(visit, exec_info, node, idx) { |
1853 | 1.34k | if (!(visited[(idx >> 5)] & (1u << (idx & 31)))) |
1854 | 50 | { |
1855 | 50 | assert(exec_info[idx].cmd.cmd == CCV_NNC_SET_FORWARD); |
1856 | 50 | if (exec_info[idx].cmd.info.blas.a[0] == 0) // Special-casing for the set function that zeroes out a tensor, not for the one that sets the gradient to 1.
1857 | 0 | ccv_array_add_unique_int(backward_from, idx); |
1858 | 50 | } |
1859 | 1.34k | } ccv_nnc_graph_visit_endfor |
1860 | 32 | ccv_nnc_graph_visit_free(visit); |
1861 | 32 | ccfree(visited); |
1862 | 32 | if (backward_from->rnum != compiled_data->backward.from_op_size) // If it doesn't match, need to redo this. |
1863 | 0 | { |
1864 | 0 | compiled_data->backward.from_op_size = backward_from->rnum; |
1865 | 0 | compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccrealloc(compiled_data->backward.from_ops, sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum); |
1866 | 0 | for (i = 0; i < backward_from->rnum; i++) |
1867 | 0 | compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){ |
1868 | 0 | .d = *(int*)ccv_array_get(backward_from, i), |
1869 | 0 | .graph = compiled_data->graph, |
1870 | 0 | }; |
1871 | 0 | } |
1872 | 32 | ccv_array_free(backward_from); |
1873 | 32 | ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type, model->max_stream_count); |
1874 | 32 | ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL); |
1875 | 32 | } |
1876 | | |
1877 | | void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size) |
1878 | 8.00k | { |
1879 | 8.00k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1880 | 8.00k | assert(compiled_data); |
1881 | 8.00k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1882 | 8.00k | assert(output_size == model->output_size * parallel_count); |
1883 | 8.00k | assert(input_size == model->input_size * parallel_count); |
1884 | 8.00k | assert(model->graph); |
1885 | 8.00k | const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES14 : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS7.99k ; |
1886 | 8.00k | const int mode_mismatch = (params.requires_grad && (7.85k compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE7.85k || compiled_data->gradient_mode != target_gradient_mode7.82k || compiled_data->disable_outgrad != params.disable_outgrad7.82k )); |
1887 | 8.00k | if (!compiled_data->graph || mode_mismatch7.91k ) |
1888 | 91 | { |
1889 | 91 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
1890 | 91 | if (mode_mismatch) // If there is a mode mismatch, we need to redo the backward pass as well (no need to redo apply_gradients, it doesn't depend on target_gradient_mode or disable_outgrad).
1891 | 32 | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
1892 | 91 | if (params.requires_grad) |
1893 | 32 | _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size); |
1894 | 59 | else |
1895 | 59 | _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size); |
1896 | 7.91k | } else { |
1897 | 7.91k | ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena); |
1898 | 7.91k | assert((input_size % parallel_count) == 0); |
1899 | 7.91k | const int input_size_per_p = input_size / parallel_count; |
1900 | 7.91k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count); |
1901 | 7.91k | assert((output_size % parallel_count) == 0); |
1902 | 7.91k | const int output_size_per_p = output_size / parallel_count; |
1903 | 7.91k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count); |
1904 | 7.91k | } |
1905 | 8.00k | if (compiled_data->is_test != params.is_test) |
1906 | 64 | { |
1907 | 64 | compiled_data->is_test = params.is_test; |
1908 | 64 | ccv_nnc_graph_exec_update_t update = { |
1909 | 64 | .parallel_count = parallel_count, |
1910 | 64 | .graph = model->graph, |
1911 | 64 | .graph_exec_arena = compiled_data->graph_exec_arena, |
1912 | 64 | }; |
1913 | 64 | ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update); |
1914 | 64 | } |
1915 | 8.00k | } |
1916 | | |
1917 | | void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1918 | 8.00k | { |
1919 | 8.00k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1920 | 8.00k | assert(compiled_data); |
1921 | 8.00k | ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size); |
1922 | 8.00k | if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD) |
1923 | 73 | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context); |
1924 | 7.93k | else { |
1925 | 7.93k | if (!compiled_data->evaluate.schedule) |
1926 | 37 | compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size); |
1927 | 7.93k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context); |
1928 | 7.93k | } |
1929 | 8.00k | } |
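The multistage entry points above are meant to be called in sequence: evaluate with requires_grad set, then backward, then apply_gradients. A minimal sketch of one training step, not part of the coverage listing, assuming the ccv_nnc headers included at the top of this file, an already-compiled model, and placeholder tensors x and out:

static void train_step(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const x, ccv_nnc_tensor_t* const out)
{
	const ccv_cnnp_evaluate_param_t params = {
		.requires_grad = 1, // Keep the multistage graph around so backward can run afterwards.
		.is_test = 0,
		.disable_outgrad = 0,
	};
	// Forward pass; binds x / out and JITs the graph on first use.
	ccv_cnnp_model_evaluate(model, params, TENSOR_LIST(x), TENSOR_LIST(out), 0, 0);
	// Backward pass; passing no ingrads makes the loss gradient default to 1 (the CMD_SET_FORWARD(1) path below).
	ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
	// Fold the gradients into the parameters and reset the backward count.
	ccv_cnnp_model_apply_gradients(model, 0);
}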
1930 | | |
1931 | | // Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE). |
1932 | | // Particularly, this method compiles the accumulator graph. |
1933 | | static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model) |
1934 | 5 | { |
1935 | 5 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1936 | 5 | assert(compiled_data); |
1937 | 5 | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
1938 | 5 | ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new(); |
1939 | 5 | const int parallel_count = ccv_max(model->parallel_count, 1); |
1940 | 5 | const int parameter_size = compiled_data->parameters->rnum; |
1941 | 5 | int i, j; |
1942 | 5 | compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3); |
1943 | 5 | compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count; |
1944 | 5 | compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count; |
1945 | 20 | for (i = 0; i < parameter_size; i++15 ) |
1946 | 30 | for (j = 0; 15 j < parallel_count; j++15 ) |
1947 | 15 | if (compiled_data->tensors.gradients[i + j * parameter_size]) |
1948 | 15 | { |
1949 | 15 | const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info; |
1950 | | // Now the old gradient becomes the accumulated gradient; set up new gradient tensors so we can collect into them.
1951 | 15 | compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size]; |
1952 | 15 | compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
1953 | 15 | ccv_nnc_tensor_symbol_t inputs[2]; |
1954 | 15 | inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1955 | 15 | inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1956 | 15 | ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0); |
1957 | 15 | ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0); |
1958 | 15 | } else { |
1959 | 0 | compiled_data->backward.accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1960 | 0 | compiled_data->backward.gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1961 | 0 | compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = NO_TENSOR_SYMBOL; |
1962 | 0 | } |
1963 | 5 | ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1964 | 5 | if (ccv_nnc_symbolic_graph_source_size(accum) == 0) |
1965 | 0 | { |
1966 | 0 | ccv_nnc_symbolic_graph_free(accum); |
1967 | | // Create empty graph. |
1968 | 0 | compiled_data->backward.accum = ccv_nnc_graph_new(); |
1969 | 0 | ccv_nnc_graph_topsort(compiled_data->backward.accum, 0, 0); |
1970 | 0 | return; |
1971 | 0 | } |
1972 | 5 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
1973 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds); |
1974 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds); |
1975 | 5 | _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds); |
1976 | 5 | ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena); |
1977 | 5 | ccv_nnc_symbolic_graph_free(accum); |
1978 | 5 | ccv_array_free(tensor_binds); |
1979 | 5 | ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type, model->max_stream_count); |
1980 | 5 | } |
1981 | | |
1982 | | void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context) |
1983 | 7.91k | { |
1984 | 7.91k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
1985 | 7.91k | assert(compiled_data); |
1986 | 7.91k | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
1987 | 7.91k | const int parallel_count = ccv_max(model->parallel_count, 1); |
1988 | 7.91k | assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count); |
1989 | 7.91k | if (outgrad_size > 0) |
1990 | 2.51k | { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); } |
1991 | 7.91k | assert(model->graph); |
1992 | 7.91k | assert(compiled_data->graph); |
1993 | 7.91k | const int parameter_size = compiled_data->parameters->rnum; |
1994 | | // If we need to accumulate the gradients now, do jit on accumulator. |
1995 | 7.91k | if (compiled_data->backward.count > 0) |
1996 | 1.71k | { |
1997 | 1.71k | if (!compiled_data->backward.accum) |
1998 | 5 | _ccv_cnnp_model_multistage_jit_1(model); |
1999 | 1.71k | else if (compiled_data->backward.count == 1) { |
2000 | | // On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly). |
2001 | 496 | int i; |
2002 | 1.48k | for (i = 0; i < parameter_size * parallel_count; i++986 ) |
2003 | 986 | { |
2004 | 986 | ccv_nnc_tensor_t* tensor; |
2005 | 986 | CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor); |
2006 | 986 | } |
2007 | 496 | if (compiled_data->backward.tensor_arena) |
2008 | 496 | { |
2009 | 496 | ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena); |
2010 | | // Rebind in case we messed up the binding (we switched accum_gradients and gradients).
2011 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1); |
2012 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1); |
2013 | 496 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1); |
2014 | 496 | } |
2015 | 496 | } |
2016 | 1.71k | } |
2017 | 7.91k | const int ingrad_size_per_p = model->output_size; |
2018 | 7.91k | const int outgrad_size_per_p = compiled_data->outgrad_size; |
2019 | 7.91k | int i, j; |
2020 | 15.8k | for (i = 0; i < ingrad_size_per_p; i++7.91k ) |
2021 | 7.91k | { |
2022 | 7.91k | const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]); |
2023 | 7.91k | if (!ingrad_size || !ingrads3.79k || ingrads[i] == 03.79k ) |
2024 | 4.22k | { |
2025 | | // Set it to 1 if it is not specified. |
2026 | 4.22k | ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad); |
2027 | 4.22k | if (ingrad_tensor) |
2028 | 4.22k | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context); |
2029 | 4.34k | for (j = 1; j < parallel_count; j++120 ) |
2030 | 120 | { |
2031 | 120 | ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j)); |
2032 | 120 | if (ingrad_tensor) |
2033 | 120 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context); |
2034 | 120 | } |
2035 | 4.22k | } else { |
2036 | | // Make sure the length matches, in case it is an alias. |
2037 | 3.69k | assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad))); |
2038 | 3.69k | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]); |
2039 | 3.69k | for (j = 1; j < parallel_count; j++6 ) |
2040 | 6 | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]); |
2041 | 3.69k | } |
2042 | 7.91k | } |
2043 | 7.91k | if (outgrad_size > 0) |
2044 | 2.51k | { |
2045 | 2.51k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad"); |
2046 | 5.14k | for (i = 0; 2.51k i < outgrad_size_per_p; i++2.62k ) |
2047 | 2.62k | if (outgrads[i]) |
2048 | 2.43k | { |
2049 | 2.43k | const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i]; |
2050 | 2.43k | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]); |
2051 | 2.43k | for (j = 1; j < parallel_count; j++6 ) |
2052 | 6 | ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]); |
2053 | 2.43k | } |
2054 | 5.40k | } else { |
2055 | 5.40k | assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || |
2056 | 5.40k | compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS); |
2057 | 5.40k | } |
2058 | | // We need to rebind here because in ccv_cnnp_evaluate we clear bindings, which resets all bindings for the gradients.
2059 | | // For parameters and internals this is fine because clearing bindings restores the original bindings, which are these
2060 | | // parameters and internals. The same cannot be said for gradients due to the accum_gradients switching. |
2061 | 7.91k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count); |
2062 | 7.91k | if (!compiled_data->backward.schedule) |
2063 | 28 | compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, model->max_stream_count, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0); |
2064 | | // Run the backward pass. |
2065 | 7.91k | ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context); |
2066 | | // If we need to run accumulation round, do that now. |
2067 | 7.91k | if (compiled_data->backward.count > 0) |
2068 | 1.71k | ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context); |
2069 | | // Update the count, this determines whether we need to accumulate or not. |
2070 | 7.91k | ++compiled_data->backward.count; |
2071 | 7.91k | } |
2072 | | |
2073 | | // Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE). |
2074 | | // Particularly, this method compiles the parameter update graph. |
2075 | | static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model) |
2076 | 24 | { |
2077 | 24 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2078 | 24 | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
2079 | 24 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2080 | 24 | const int parameter_size = compiled_data->parameters->rnum; |
2081 | 24 | ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0); |
2082 | 24 | _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
2083 | 24 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds); |
2084 | | // Bind accumulated gradients. |
2085 | 24 | if (compiled_data->backward.count > 1) |
2086 | 4 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds); |
2087 | 20 | else |
2088 | 20 | _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds); |
2089 | 24 | ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0); |
2090 | 24 | int i, j; |
2091 | 256 | for (i = 0; i < compiled_data->backward.to_size; i++232 ) |
2092 | 232 | { |
2093 | 232 | const int* tos; |
2094 | 232 | int to_size; |
2095 | 232 | ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size); |
2096 | 738 | for (j = 0; j < to_size; j++506 ) |
2097 | 506 | { |
2098 | | // Check if this already shows up in the backward graph; if that is the case, it won't be in the apply
2099 | | // gradients graph. |
2100 | 506 | const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){ |
2101 | 506 | .d = tos[j], |
2102 | 506 | .graph = model->graph, |
2103 | 506 | }); |
2104 | 506 | if (!exec.graph) |
2105 | 316 | ccv_array_add_unique_int(apply_gradients_from, tos[j]); |
2106 | 506 | } |
2107 | 232 | } |
2108 | 24 | const int from_size = apply_gradients_from->rnum; |
2109 | 24 | if (from_size == 0) |
2110 | 0 | { |
2111 | 0 | ccv_array_free(apply_gradients_from); |
2112 | 0 | ccv_array_free(tensor_binds); |
2113 | 0 | return; |
2114 | 0 | } |
2115 | 24 | ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size); |
2116 | 160 | for (i = 0; i < from_size; i++136 ) |
2117 | 136 | froms[i] = (ccv_nnc_graph_exec_symbol_t){ |
2118 | 136 | .d = *(int*)ccv_array_get(apply_gradients_from, i), |
2119 | 136 | .graph = model->graph |
2120 | 136 | }; |
2121 | 24 | ccv_array_free(apply_gradients_from); |
2122 | | // It can only end with updates on the parameters.
2123 | 24 | ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0); |
2124 | 160 | for (i = 0; i < parameter_size; i++136 ) |
2125 | 136 | { |
2126 | 136 | if (compiled_data->update_nodes[i].d == CCV_NNC_NO_TENSOR_SYMBOL) |
2127 | 0 | continue; |
2128 | 136 | ccv_array_push(tos, &compiled_data->update_nodes[i]); |
2129 | 316 | for (j = 1; j < parallel_count; j++180 ) |
2130 | 180 | { |
2131 | 180 | const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j); |
2132 | 180 | ccv_array_push(tos, ©); |
2133 | 180 | } |
2134 | 136 | } |
2135 | 24 | ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena); |
2136 | 24 | ccv_array_free(tos); |
2137 | 24 | ccv_array_free(tensor_binds); |
2138 | 24 | ccfree(froms); |
2139 | 24 | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
2140 | 219 | for (i = 0; i < max_saved_aux_size * parameter_size; i++195 ) |
2141 | 195 | { |
2142 | | // Skip on no tensor. |
2143 | 195 | if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL) |
2144 | 0 | continue; |
2145 | 195 | ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source); |
2146 | 195 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0); |
2147 | 543 | for (j = 1; j < parallel_count; j++348 ) |
2148 | 348 | { |
2149 | 348 | ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j)); |
2150 | 348 | if (copy) |
2151 | 348 | ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, ©, 1, 0); |
2152 | 348 | } |
2153 | 195 | } |
2154 | 24 | ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type, model->max_stream_count); |
2155 | 24 | } |
2156 | | |
2157 | | void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context) |
2158 | 7.84k | { |
2159 | 7.84k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2160 | 7.84k | assert(compiled_data); |
2161 | 7.84k | assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE); |
2162 | 7.84k | const int parallel_count = ccv_max(model->parallel_count, 1); |
2163 | 7.84k | assert(model->graph); |
2164 | 7.84k | assert(compiled_data->graph); |
2165 | | // Skip if there is no backward pass. |
2166 | 7.84k | if (compiled_data->backward.count <= 0) |
2167 | 1.65k | return; |
2168 | | // Skip if there are no parameters.
2169 | 6.19k | if (compiled_data->parameters->rnum == 0) |
2170 | 3 | { |
2171 | 3 | compiled_data->backward.count = 0; |
2172 | 3 | return; |
2173 | 3 | } |
2174 | 6.19k | if (!compiled_data->apply_gradients.graph) |
2175 | 24 | _ccv_cnnp_model_multistage_jit_2(model); |
2176 | 6.16k | else { |
2177 | 6.16k | const int parameter_size = compiled_data->parameters->rnum; |
2178 | 6.16k | ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena); |
2179 | | // Change to bind accum_gradients if we do gradient accumulation (run backward more than once). |
2180 | 6.16k | if (compiled_data->backward.count > 1) |
2181 | 497 | _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count); |
2182 | 5.67k | else |
2183 | 5.67k | _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count); |
2184 | 6.16k | } |
2185 | 6.19k | if (compiled_data->apply_gradients.graph) |
2186 | 6.19k | ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context); |
2187 | | // Reset backward count to 0. |
2188 | 6.19k | compiled_data->backward.count = 0; |
2189 | 6.19k | } |
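Because apply_gradients is the only place backward.count is reset, several ccv_cnnp_model_backward calls can be issued between updates; from the second call on, the EWSUM accumulator graph built by _ccv_cnnp_model_multistage_jit_1 sums the new gradients into tensors.accum_gradients. A hedged sketch of that micro-batch pattern (xs and outs are assumed, pre-allocated tensor arrays; this is not from the listing):

static void accumulate_then_update(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const xs, ccv_nnc_tensor_t* const* const outs, const int micro_batches)
{
	const ccv_cnnp_evaluate_param_t params = {
		.requires_grad = 1,
		.is_test = 0,
		.disable_outgrad = 0,
	};
	int i;
	for (i = 0; i < micro_batches; i++)
	{
		ccv_cnnp_model_evaluate(model, params, TENSOR_LIST(xs[i]), TENSOR_LIST(outs[i]), 0, 0);
		// Every call after the first accumulates into tensors.accum_gradients via EWSUM.
		ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
	}
	// Applies the accumulated gradients once and resets backward.count to 0.
	ccv_cnnp_model_apply_gradients(model, 0);
}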
2190 | | |
2191 | | void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor) |
2192 | 35 | { |
2193 | 35 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2194 | 35 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 18 : parameter->param_sel27 ; |
2195 | 35 | assert(parameter->param_sel != 0); |
2196 | 35 | const int tensors_init = !!compiled_data->tensors_init.v; |
2197 | 35 | if (!tensors_init) |
2198 | 19 | _ccv_cnnp_model_tensors_init(model, compiled_data); |
2199 | 16 | else if ((uintptr_t)compiled_data->tensors_init.v & (uintptr_t)1) |
2200 | | // Check if it is not fully allocated; if it is not, run init_1.
2201 | 0 | ccv_cnnp_model_tensors_init_1(model, compiled_data); |
2202 | 35 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2203 | 35 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2204 | 35 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 134 : parameter->param_ref1 ; |
2205 | 35 | if (param_ref < 0) |
2206 | 1 | { assert(parameter_indices->rnum == 1); } |
2207 | 34 | else |
2208 | 34 | { assert(param_ref < parameter_indices->rnum); } |
2209 | 35 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2210 | 35 | ccv_array_free(parameter_indices); |
2211 | 35 | const int parameter_size = compiled_data->parameters->rnum; |
2212 | 35 | assert(d >= 0); |
2213 | 35 | assert(d < parameter_size); |
2214 | 35 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2215 | 35 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2216 | 35 | assert(dest); |
2217 | 35 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0); |
2218 | 35 | int i; |
2219 | 35 | for (i = 1; i < parallel_count; i++0 ) |
2220 | 0 | { |
2221 | 0 | ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d + i * parameter_size]); |
2222 | 0 | if (copy_tensor) |
2223 | 0 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0); |
2224 | 0 | } |
2225 | | // Mark this symbol as init'ed. |
2226 | 35 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d; |
2227 | 35 | uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
2228 | 35 | init_v[s >> 5] |= (1u << (s & 0x1f)); |
2229 | 35 | } |
2230 | | |
2231 | | void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor) |
2232 | 6 | { |
2233 | 6 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2234 | 6 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 13 : parameter->param_sel3 ; |
2235 | 6 | assert(parameter->param_sel != 0); |
2236 | 6 | assert(compiled_data->tensors.parameters); |
2237 | 6 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2238 | 6 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2239 | 6 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 13 : parameter->param_ref3 ; |
2240 | 6 | if (param_ref < 0) |
2241 | 3 | { assert(parameter_indices->rnum == 1); } |
2242 | 3 | else |
2243 | 3 | { assert(param_ref < parameter_indices->rnum); } |
2244 | 6 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2245 | 6 | ccv_array_free(parameter_indices); |
2246 | 6 | const int parameter_size = compiled_data->parameters->rnum; |
2247 | 6 | assert(d >= 0); |
2248 | 6 | assert(d < parameter_size); |
2249 | | // We don't need to consider parallel_count, every parameter on each device is identical. |
2250 | 6 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2251 | 6 | assert(src); |
2252 | 6 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0); |
2253 | 6 | } |
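The two accessors above pair naturally with ccv_cnnp_model_parameters for round-tripping a single parameter. A small sketch, with the selector/index pair (-1, 0) and the surrounding tensors treated as placeholders rather than values taken from this file:

static void overwrite_and_read_back(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const new_value)
{
	// Select parameter 0 (the -1 selector is used the same way elsewhere in this file).
	const ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, 0);
	// Upload the new value; this also broadcasts to every device when parallel_count > 1.
	ccv_cnnp_model_set_parameter(model, parameter, new_value);
	// Allocate a tensor with matching parameters and copy the stored value back out.
	const ccv_nnc_tensor_param_t info = ccv_cnnp_model_parameter_tensor_params(model, parameter);
	ccv_nnc_tensor_t* const readback = ccv_nnc_tensor_new(0, info, 0);
	ccv_cnnp_model_parameter_copy(model, parameter, readback);
	ccv_nnc_tensor_free(readback);
}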
2254 | | |
2255 | | ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter) |
2256 | 1 | { |
2257 | 1 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2258 | 1 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 10 : parameter->param_sel; |
2259 | 1 | assert(parameter->param_sel != 0); |
2260 | 1 | assert(compiled_data->tensors.parameters); |
2261 | 1 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2262 | 1 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2263 | 1 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 10 : parameter->param_ref; |
2264 | 1 | if (param_ref < 0) |
2265 | 1 | { assert(parameter_indices->rnum == 1); } |
2266 | 0 | else |
2267 | 0 | { assert(param_ref < parameter_indices->rnum); } |
2268 | 1 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2269 | 1 | ccv_array_free(parameter_indices); |
2270 | 1 | const int parameter_size = compiled_data->parameters->rnum; |
2271 | 1 | assert(d >= 0); |
2272 | 1 | assert(d < parameter_size); |
2273 | | // We don't need to consider parallel_count, every parameter on each device is identical. |
2274 | 1 | ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[d]); |
2275 | 1 | assert(tensor); |
2276 | 1 | return tensor->info; |
2277 | 1 | } |
2278 | | |
2279 | | const char* ccv_cnnp_model_parameter_name(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter) |
2280 | 2 | { |
2281 | 2 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2282 | 2 | const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel0 ; |
2283 | 2 | assert(parameter->param_sel != 0); |
2284 | 2 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2285 | 2 | ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices); |
2286 | 2 | const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref0 ; |
2287 | 2 | if (param_ref < 0) |
2288 | 0 | { assert(parameter_indices->rnum == 1); } |
2289 | 2 | else |
2290 | 2 | { assert(param_ref < parameter_indices->rnum); } |
2291 | 2 | const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0); |
2292 | 2 | ccv_array_free(parameter_indices); |
2293 | 2 | const int parameter_size = compiled_data->parameters->rnum; |
2294 | 2 | assert(d >= 0); |
2295 | 2 | assert(d < parameter_size); |
2296 | 2 | return *(char**)ccv_array_get(compiled_data->ids.parameters, d); |
2297 | 2 | } |
2298 | | |
2299 | | int ccv_cnnp_model_parameter_count(ccv_cnnp_model_t* const model) |
2300 | 0 | { |
2301 | 0 | assert(model->compiled_data); |
2302 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2303 | 0 | return compiled_data->parameters->rnum; |
2304 | 0 | } |
2305 | | |
2306 | | uint64_t ccv_cnnp_model_parameters_size(ccv_cnnp_model_t* const model) |
2307 | 0 | { |
2308 | 0 | assert(model->compiled_data); |
2309 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2310 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2311 | 0 | int i; |
2312 | 0 | const ccv_nnc_symbolic_graph_t* const graph = model->graph; |
2313 | 0 | uint64_t size = 0; |
2314 | 0 | for (i = 0; i < parameter_size; i++) |
2315 | 0 | { |
2316 | 0 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
2317 | 0 | ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(graph, (ccv_nnc_tensor_symbol_t){ |
2318 | 0 | .graph = graph, |
2319 | 0 | .d = d |
2320 | 0 | }); |
2321 | 0 | size += ccv_nnc_tensor_data_size(params); |
2322 | 0 | } |
2323 | 0 | return size; |
2324 | 0 | } |
2325 | | |
2326 | | int ccv_cnnp_model_parameters_move(ccv_cnnp_model_t* const model, char** const names, ccv_nnc_tensor_t** const tensors, const int count, int type) |
2327 | 3 | { |
2328 | 3 | assert(model->compiled_data); |
2329 | 3 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2330 | 3 | if (count != compiled_data->parameters->rnum) |
2331 | 0 | return 0; |
2332 | 3 | if (CCV_TENSOR_GET_DEVICE(type) == CCV_COMPUTE_DEVICE_ANY) |
2333 | 0 | CCV_TENSOR_SET_DEVICE_ID(type, 0); |
2334 | 3 | int i; |
2335 | | // We don't need to consider parallel_count, every parameter on each device is identical. |
2336 | 6 | for (i = 0; i < count; i++3 ) |
2337 | 3 | { |
2338 | 3 | ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i]; |
2339 | 3 | if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything. |
2340 | 0 | { |
2341 | 0 | tensors[i] = 0; |
2342 | 0 | continue; |
2343 | 0 | } |
2344 | 3 | tensor = CCV_NNC_TENSOR(tensor); |
2345 | 3 | if (tensor->info.type == type) |
2346 | 3 | tensors[i] = tensor; |
2347 | 0 | else { |
2348 | 0 | ccv_nnc_tensor_param_t info = tensor->info; |
2349 | 0 | info.type = type; |
2350 | 0 | tensors[i] = ccv_nnc_tensor_new(0, info, 0); // Create this tensor, don't initiate copy yet. |
2351 | 0 | } |
2352 | 3 | } |
2353 | 6 | for (i = 0; i < count; i++3 ) |
2354 | 3 | { |
2355 | 3 | ccv_nnc_tensor_t* tensor = compiled_data->tensors.parameters[i]; |
2356 | 3 | if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything. |
2357 | 0 | continue; |
2358 | 3 | tensor = CCV_NNC_TENSOR(tensor); |
2359 | | // Now initiate the transfer. We should do this on a stream.
2360 | 3 | if (tensor->info.type != type) |
2361 | 0 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensor), TENSOR_LIST(tensors[i]), 0); |
2362 | 3 | } |
2363 | | // Copy names and remove parameters. |
2364 | 6 | for (i = 0; i < count; i++3 ) |
2365 | 3 | { |
2366 | 3 | ccv_nnc_tensor_t* const tensor = compiled_data->tensors.parameters[i]; |
2367 | 3 | if ((uintptr_t)tensor & (uintptr_t)1) // If it is not owned. We don't do anything. |
2368 | 0 | { |
2369 | 0 | names[i] = 0; |
2370 | 0 | continue; |
2371 | 0 | } |
2372 | 3 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i); |
2373 | 3 | const size_t name_len = ccv_min(strnlen(name, 1023), 1023); |
2374 | 3 | names[i] = ccmalloc(name_len + 1); |
2375 | 3 | names[i][name_len] = 0; |
2376 | 3 | memcpy(names[i], name, name_len); |
2377 | 3 | compiled_data->tensors.parameters[i] = 0; |
2378 | 3 | } |
2379 | 3 | return 1; |
2380 | 3 | } |
2381 | | |
2382 | | KHASH_MAP_INIT_STR(ccv_cnnp_parameter_id, int) |
2383 | | |
2384 | | void ccv_cnnp_model_set_parameters_from_key_values(ccv_cnnp_model_t* const model, char* const* const names, ccv_nnc_tensor_t** const tensors, const int count, const int invalidates) |
2385 | 2 | { |
2386 | 2 | assert(model->compiled_data); |
2387 | 2 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2388 | 2 | int i; |
2389 | 2 | khash_t(ccv_cnnp_parameter_id)* id_map = 0; |
2390 | 2 | if (count != compiled_data->parameters->rnum) |
2391 | 0 | { |
2392 | 0 | id_map = kh_init(ccv_cnnp_parameter_id); |
2393 | | // Build the map between name and the index. |
2394 | 0 | for (i = 0; i < count; i++) |
2395 | 0 | { |
2396 | 0 | int ret; |
2397 | 0 | const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[i], &ret); |
2398 | 0 | assert(ret != 0); |
2399 | 0 | kh_val(id_map, k) = i; |
2400 | 0 | } |
2401 | 0 | } |
2402 | 2 | const int parameter_size = compiled_data->parameters->rnum; |
2403 | 2 | int* copy_back = 0; |
2404 | 2 | const int tensors_init = !!compiled_data->tensors_init.v; |
2405 | 2 | if (!tensors_init) |
2406 | 1 | ccv_cnnp_model_tensors_init_0(model, compiled_data); |
2407 | 2 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2408 | 2 | uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
2409 | 4 | for (i = 0; i < parameter_size; i++2 ) |
2410 | 2 | { |
2411 | 2 | int j = i; |
2412 | 2 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, ccv_min(count - 1, i)); |
2413 | 2 | if (strncmp(name, names[i], 1023) != 0) |
2414 | 0 | { |
2415 | | // Build the map. |
2416 | 0 | if (id_map == 0) |
2417 | 0 | { |
2418 | 0 | id_map = kh_init(ccv_cnnp_parameter_id); |
2419 | 0 | for (j = 0; j < count; j++) |
2420 | 0 | { |
2421 | 0 | int ret; |
2422 | 0 | const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, names[j], &ret); |
2423 | 0 | assert(ret != 0); |
2424 | 0 | kh_val(id_map, k) = j; |
2425 | 0 | } |
2426 | 0 | } |
2427 | 0 | const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, name); |
2428 | 0 | if (k == kh_end(id_map)) // Cannot find the name, skip. |
2429 | 0 | continue; |
2430 | 0 | j = kh_val(id_map, k); |
2431 | 0 | } |
2432 | 2 | if (compiled_data->tensors.parameters[i]) // Cannot be a shared parameter to read. |
2433 | 0 | { assert(!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)); } |
2434 | 2 | const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i); |
2435 | 2 | ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter); |
2436 | 2 | if (CCV_TENSOR_GET_DEVICE(info.type) == CCV_COMPUTE_DEVICE_ANY) |
2437 | 1 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
2438 | 2 | const int d = parameter.d; |
2439 | 2 | if (info.type == tensors[j]->info.type && invalidates) // Can move. |
2440 | 1 | { |
2441 | | // Deallocate it if needed. |
2442 | 1 | if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)) |
2443 | 1 | if (compiled_data->tensors.parameters[i]) |
2444 | 0 | ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]); |
2445 | 1 | compiled_data->tensors.parameters[i] = tensors[j]; |
2446 | 1 | tensors[j] = 0; |
2447 | 1 | } else { |
2448 | 1 | if (!compiled_data->tensors.parameters[i]) |
2449 | 1 | { // Not allocated yet, so allocate it first.
2450 | | // Create new one, make sure we create this by having the right parameters. |
2451 | 1 | const int type = info.type; |
2452 | 1 | info = tensors[j]->info; |
2453 | 1 | info.type = type; // Revert back the type. |
2454 | 1 | compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0); |
2455 | 1 | } |
2456 | 1 | if (!copy_back) |
2457 | 1 | copy_back = (int*)cccalloc(parameter_size, sizeof(int)); |
2458 | 1 | copy_back[i] = j + 1; |
2459 | 1 | } |
2460 | 2 | init_v[d >> 5] |= (1u << (d & 0x1f)); |
2461 | | // Create this tensor for other data parallel allocations. |
2462 | 2 | info = compiled_data->tensors.parameters[i]->info; // In case we loaded a different info. |
2463 | 2 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(info.type); |
2464 | 2 | for (j = 1; j < parallel_count; j++0 ) |
2465 | 0 | if (!compiled_data->tensors.parameters[i + j * parameter_size]) |
2466 | 0 | { |
2467 | 0 | if (j != device_id) |
2468 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, j); |
2469 | 0 | else |
2470 | 0 | CCV_TENSOR_SET_DEVICE_ID(info.type, 0); |
2471 | 0 | compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0); |
2472 | 0 | } |
2473 | | // No need to copy over, this is done in ccv_cnnp_model.c's copy_tensors method. |
2474 | 2 | } |
2475 | 2 | if (id_map) |
2476 | 0 | kh_destroy(ccv_cnnp_parameter_id, id_map); |
2477 | | // Now do the transfer. |
2478 | 2 | if (copy_back) |
2479 | 1 | { |
2480 | 2 | for (i = 0; i < parameter_size; i++1 ) |
2481 | 1 | { |
2482 | 1 | ccv_nnc_tensor_t* const tensor = CCV_NNC_TENSOR(compiled_data->tensors.parameters[i]); |
2483 | 1 | if (copy_back[i] == 0) |
2484 | 0 | continue; |
2485 | 1 | const int j = copy_back[i] - 1; |
2486 | 1 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(tensors[j]), TENSOR_LIST(tensor), 0); |
2487 | 1 | } |
2488 | 1 | if (invalidates) |
2489 | 0 | for (i = 0; i < parameter_size; i++) |
2490 | 0 | { |
2491 | 0 | if (copy_back[i] == 0) |
2492 | 0 | continue; |
2493 | 0 | const int j = copy_back[i] - 1; |
2494 | 0 | ccv_nnc_tensor_free(tensors[j]); |
2495 | 0 | tensors[j] = 0; |
2496 | 0 | } |
2497 | 1 | ccfree(copy_back); |
2498 | 1 | } |
2499 | 2 | } |
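Taken together, ccv_cnnp_model_parameters_move and ccv_cnnp_model_set_parameters_from_key_values let a caller detach the parameter tensors from one model and reattach them by name to another. A rough sketch, assuming both models are compiled with matching parameter ids and that CPU memory is the requested type; error handling and the shared-parameter cases are simplified:

static void move_and_restore(ccv_cnnp_model_t* const src, ccv_cnnp_model_t* const dst)
{
	const int count = ccv_cnnp_model_parameter_count(src);
	char** const names = (char**)ccmalloc(sizeof(char*) * count);
	ccv_nnc_tensor_t** const tensors = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * count);
	// Detach the parameters from src; for owned parameters the caller now owns the tensors and names.
	if (ccv_cnnp_model_parameters_move(src, names, tensors, count, CCV_TENSOR_CPU_MEMORY))
	{
		// Reattach by name; invalidates = 1 lets dst take ownership (or free after copying) where it can.
		ccv_cnnp_model_set_parameters_from_key_values(dst, (char* const*)names, tensors, count, 1);
		int i;
		for (i = 0; i < count; i++)
		{
			if (tensors[i]) // Anything not consumed above still belongs to us.
				ccv_nnc_tensor_free(tensors[i]);
			if (names[i])
				ccfree(names[i]);
		}
	}
	ccfree(names);
	ccfree(tensors);
}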
2500 | | |
2501 | | ccv_cnnp_model_io_t ccv_cnnp_model_parameter_first(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f first, void* const context) |
2502 | 0 | { |
2503 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2504 | 0 | assert(compiled_data); |
2505 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2506 | 0 | int i; |
2507 | 0 | for (i = 0; i < parameter_size; i++) |
2508 | 0 | { |
2509 | 0 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i); |
2510 | 0 | if (first(model, name, context)) |
2511 | 0 | return ccv_cnnp_model_parameters(model, -1, i); |
2512 | 0 | } |
2513 | 0 | return 0; |
2514 | 0 | } |
2515 | | |
2516 | | ccv_array_t* ccv_cnnp_model_parameters_filter(ccv_cnnp_model_t* const model, ccv_cnnp_model_parameters_filter_f filter, void* const context) |
2517 | 0 | { |
2518 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2519 | 0 | assert(compiled_data); |
2520 | 0 | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 0, 0); |
2521 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2522 | 0 | int i; |
2523 | 0 | for (i = 0; i < parameter_size; i++) |
2524 | 0 | { |
2525 | 0 | const char* const name = *(char**)ccv_array_get(compiled_data->ids.parameters, i); |
2526 | 0 | if (filter(model, name, context)) |
2527 | 0 | { |
2528 | 0 | ccv_cnnp_model_io_t parameter = ccv_cnnp_model_parameters(model, -1, i); |
2529 | 0 | ccv_array_push(parameters, ¶meter); |
2530 | 0 | } |
2531 | 0 | } |
2532 | 0 | return parameters; |
2533 | |
2534 | 0 | } |
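Both ccv_cnnp_model_parameter_first and ccv_cnnp_model_parameters_filter take a caller-supplied predicate over the stored parameter ids. The exact ccv_cnnp_model_parameters_filter_f typedef lives in the public header; from its use above it receives the model, the parameter name and the user context, and returns non-zero to select. A sketch of a substring-based selector (the "bias" literal is only an example):

#include <string.h>

// Select any parameter whose id contains the substring passed as context.
static int match_substring(const ccv_cnnp_model_t* const model, const char* const name, void* const context)
{
	return strstr(name, (const char*)context) != 0;
}

// Collect every parameter io whose id contains "bias"; the caller frees the returned array.
static ccv_array_t* bias_parameters(ccv_cnnp_model_t* const model)
{
	return ccv_cnnp_model_parameters_filter(model, match_substring, "bias");
}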
2535 | | |
2536 | | CCV_WARN_UNUSED(ccv_cnnp_model_io_t) ccv_cnnp_model_parameter_first_uninit(ccv_cnnp_model_t* const model) |
2537 | 0 | { |
2538 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
2539 | 0 | assert(compiled_data); |
2540 | 0 | const int tensors_init = !!compiled_data->tensors_init.v; |
2541 | 0 | if (!tensors_init) // If nothing initialized, we return parameter 0. |
2542 | 0 | return ccv_cnnp_model_parameters(model, -1, 0); |
2543 | 0 | const int parameter_size = compiled_data->parameters->rnum; |
2544 | 0 | int i; |
2545 | 0 | const uint32_t* const init_v = CCV_NNC_INIT_V(compiled_data->tensors_init.v); |
2546 | 0 | for (i = 0; i < parameter_size; i++) |
2547 | 0 | { |
2548 | 0 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d; |
2549 | 0 | if (!(init_v[d >> 5] & (1u << (d & 0x1f)))) |
2550 | 0 | return ccv_cnnp_model_parameters(model, -1, i); |
2551 | 0 | } |
2552 | 0 | return 0; |
2553 | 0 | } |
2554 | | |
2555 | | static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref) |
2556 | 49 | { |
2557 | 49 | const int to_param_sel = parameters->param_sel > 0 ? parameters->param_sel - 10 : parameters->param_sel; |
2558 | 49 | assert(parameters->param_sel != 0); |
2559 | 49 | ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
2560 | 49 | ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices); |
2561 | 49 | *param_ref = parameters->param_ref > 0 ? parameters->param_ref - 10 : parameters->param_ref; |
2562 | 49 | return to_parameter_indices; |
2563 | 49 | } |
2564 | | |
2565 | | static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref, const int only_init_0) |
2566 | 14 | { |
2567 | | // If the model is not compiled yet, compile it now.
2568 | 14 | if (!model->graph) |
2569 | 3 | { |
2570 | 3 | model->graph = ccv_nnc_symbolic_graph_new(); |
2571 | 3 | assert(from_model->compiled_data); |
2572 | 3 | const int input_size = from_model->input_size; |
2573 | 3 | ccv_nnc_tensor_param_t input_params[input_size]; |
2574 | 3 | int i; |
2575 | 9 | for (i = 0; i < input_size; i++6 ) |
2576 | 6 | input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]); |
2577 | 3 | _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss); |
2578 | 3 | model->parallel_count = from_model->parallel_count; |
2579 | 3 | model->memory_compression = from_model->memory_compression; |
2580 | 3 | model->memory_reduction = from_model->memory_reduction; |
2581 | 3 | model->gradient_checkpointing = from_model->gradient_checkpointing; |
2582 | 3 | model->compiled_data->stream_type = from_model->compiled_data->stream_type; |
2583 | 3 | model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer; |
2584 | 3 | model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size; |
2585 | 3 | } |
2586 | 14 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2587 | 14 | assert(to_compiled_data); |
2588 | 14 | const int to_tensors_init = !!to_compiled_data->tensors_init.v; |
2589 | 14 | if (!to_tensors_init) |
2590 | 10 | { |
2591 | 10 | if (only_init_0) |
2592 | 1 | ccv_cnnp_model_tensors_init_0(model, to_compiled_data); |
2593 | 9 | else |
2594 | 9 | _ccv_cnnp_model_tensors_init(model, to_compiled_data); |
2595 | 10 | } else if (4 !only_init_04 && (uintptr_t)to_compiled_data->tensors_init.v & (uintptr_t)13 ) |
2596 | | // Check if it is not fully allocated; if it is not, run init_1.
2597 | 0 | ccv_cnnp_model_tensors_init_1(model, to_compiled_data); |
2598 | 14 | assert(to_compiled_data->tensors.parameters); |
2599 | 14 | *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref); |
2600 | 14 | *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref); |
2601 | 14 | if (*from_param_ref < 0 && *param_ref >= 0) |
2602 | 0 | { assert((*from_parameter_indices)->rnum == 1); } |
2603 | 14 | else if (*from_param_ref >= 0) |
2604 | 0 | { assert(*from_param_ref < (*from_parameter_indices)->rnum); } |
2605 | 14 | if (*param_ref < 0 && *from_param_ref >= 0) |
2606 | 0 | { assert((*parameter_indices)->rnum == 1); } |
2607 | 14 | else if (*param_ref >= 0) |
2608 | 0 | { assert(*param_ref < (*parameter_indices)->rnum); } |
2609 | 14 | } |
2610 | | |
2611 | | void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters) |
2612 | 9 | { |
2613 | 9 | ccv_array_t* to_parameter_indices; |
2614 | 9 | int to_param_ref; |
2615 | 9 | ccv_array_t* from_parameter_indices; |
2616 | 9 | int from_param_ref; |
2617 | 9 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0); |
2618 | | // Should be exactly the same tensor. |
2619 | 9 | if (to_param_ref < 0 && from_param_ref < 0) |
2620 | 9 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2621 | | // To models. |
2622 | 9 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2623 | 9 | assert(to_compiled_data); |
2624 | | // From models. |
2625 | 9 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2626 | 9 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2627 | 9 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2628 | 9 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 10 ; |
2629 | 9 | int i, j; |
2630 | 9 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2631 | 9 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2632 | 18 | for (i = 0; i < rnum; i++9 ) |
2633 | 9 | { |
2634 | 9 | const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i); |
2635 | 9 | assert(src_d >= 0); |
2636 | 9 | assert(src_d < from_compiled_data->parameters->rnum); |
2637 | 9 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2638 | | // If the original is not init'ed, we cannot copy from it.
2639 | 9 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2640 | 0 | continue; |
2641 | 9 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2642 | 9 | assert(dest_d >= 0); |
2643 | 9 | assert(dest_d < to_compiled_data->parameters->rnum); |
2644 | 9 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]); |
2645 | 9 | assert(src); |
2646 | 9 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2647 | 9 | assert(dest); |
2648 | 9 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0); |
2649 | 27 | for (j = 1; j < parallel_count; j++18 ) |
2650 | 18 | { |
2651 | 18 | ccv_nnc_tensor_t* const copy_tensor = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2652 | 18 | if (copy_tensor) |
2653 | 18 | ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0); |
2654 | 18 | } |
2655 | | // Mark this symbol as init'ed. |
2656 | 9 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2657 | 9 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2658 | 9 | } |
2659 | 9 | ccv_array_free(to_parameter_indices); |
2660 | 9 | ccv_array_free(from_parameter_indices); |
2661 | 9 | } |
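For whole-model weight syncing, the function above is typically called with the full parameter set on both sides. A short sketch, assuming the (-1, -1) selector/index pair selects all parameters (an assumption consistent with the negative param_sel / param_ref branches above):

static void sync_all_parameters(ccv_cnnp_model_t* const dst, ccv_cnnp_model_t* const src)
{
	// -1 / -1 is assumed to select the whole parameter set of each model.
	const ccv_cnnp_model_io_t dst_params = ccv_cnnp_model_parameters(dst, -1, -1);
	const ccv_cnnp_model_io_t src_params = ccv_cnnp_model_parameters(src, -1, -1);
	// Copies every initialized parameter of src into dst (and to every device copy of dst).
	ccv_cnnp_model_set_parameters(dst, dst_params, src, src_params);
}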
2662 | | |
2663 | | void ccv_cnnp_model_share_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_cnnp_model_parameters_renamer_f renamer, void* const context) |
2664 | 2 | { |
2665 | 2 | ccv_array_t* to_parameter_indices; |
2666 | 2 | int to_param_ref; |
2667 | 2 | ccv_array_t* from_parameter_indices; |
2668 | 2 | int from_param_ref; |
2669 | 2 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 1); |
2670 | | // Should be exactly the same tensor. |
2671 | 2 | if (renamer == 0 && to_param_ref < 01 && from_param_ref < 01 ) |
2672 | 1 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2673 | | // To models. |
2674 | 2 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2675 | 2 | assert(to_compiled_data); |
2676 | | // From models. |
2677 | 2 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2678 | 2 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2679 | 2 | assert(parallel_count == ccv_max(from_model->parallel_count, 1)); // Must have the same parallel count to share parameters.
2680 | 2 | const int from_parameter_size = from_compiled_data->parameters->rnum; |
2681 | 2 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2682 | 2 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2683 | 2 | int i, j; |
2684 | 2 | khash_t(ccv_cnnp_parameter_id)* id_map = 0; |
2685 | 2 | char* updated_name = 0; |
2686 | 2 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2687 | 2 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2688 | 8 | for (i = 0; i < rnum; i++6 ) |
2689 | 6 | { |
2690 | 6 | int src_d = (from_param_ref >= 0 ? from_param_ref0 : i) < from_parameter_indices->rnum ? *(int*)4 ccv_array_get4 (from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i) : from_parameter_size2 ; |
2691 | | // Need to figure out how to use the renamer here. |
2692 | 6 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2693 | 6 | assert(dest_d >= 0); |
2694 | 6 | assert(dest_d < to_parameter_size); |
2695 | 6 | if (renamer) |
2696 | 3 | { |
2697 | 3 | const char* const src_name = (src_d < from_parameter_size && src_d >= 01 ) ? *(char**)1 ccv_array_get1 (from_compiled_data->ids.parameters, src_d) : 02 ; |
2698 | 3 | const char* const dest_name = *(char**)ccv_array_get(to_compiled_data->ids.parameters, dest_d); |
2699 | 3 | if (!updated_name) |
2700 | 1 | updated_name = (char*)ccmalloc(1024); |
2701 | 3 | const size_t src_name_len = src_name == 0 ? 02 : ccv_min1 (strnlen(src_name, 1023), 1023); |
2702 | 3 | if (src_name_len > 0) |
2703 | 1 | memcpy(updated_name, src_name, src_name_len); |
2704 | 3 | updated_name[src_name_len] = 0; |
2705 | 3 | if (renamer(context, dest_name, updated_name, 1024) != 0) |
2706 | 0 | continue; // Skip this. |
2707 | 3 | if (src_name != 0 && memcmp(updated_name, src_name, src_name_len) == 01 && strnlen(updated_name, 1023) == src_name_len0 ) |
2708 | 0 | { |
2709 | | // Nothing changed. |
2710 | 3 | } else { |
2711 | 3 | if (!id_map) |
2712 | 1 | { |
2713 | 1 | id_map = kh_init(ccv_cnnp_parameter_id); |
2714 | 2 | for (j = 0; j < from_parameter_size; j++1 ) |
2715 | 1 | { |
2716 | 1 | int ret; |
2717 | 1 | const khiter_t k = kh_put(ccv_cnnp_parameter_id, id_map, *(char**)ccv_array_get(from_compiled_data->ids.parameters, j), &ret); |
2718 | 1 | assert(ret != 0); |
2719 | 1 | kh_val(id_map, k) = j; |
2720 | 1 | } |
2721 | 1 | } |
2722 | 3 | const khiter_t k = kh_get(ccv_cnnp_parameter_id, id_map, updated_name); |
2723 | 3 | if (k == kh_end(id_map)) // Cannot find the name, skip. |
2724 | 2 | continue; |
2725 | 1 | src_d = kh_val(id_map, k); |
2726 | 1 | assert(src_d >= 0); |
2727 | 1 | assert(src_d < from_parameter_size); |
2728 | 1 | } |
2729 | 3 | } |
2730 | 6 | assert(src_d >= 0)4 ; |
2731 | 4 | assert(src_d < from_parameter_size); |
2732 | 4 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2733 | | // If the original is not init'ed, we cannot share from it.
2734 | 4 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2735 | 0 | continue; |
2736 | 8 | for (j = 0; 4 j < parallel_count; j++4 ) |
2737 | 4 | { |
2738 | 4 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * from_parameter_size]); |
2739 | 4 | assert(src); |
2740 | 4 | ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]; |
2741 | 4 | if (dest && !((uintptr_t)dest & (uintptr_t)1)1 ) |
2742 | 1 | ccv_nnc_tensor_free(dest); |
2743 | 4 | to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size] = (ccv_nnc_tensor_t*)((uintptr_t)src | (uintptr_t)1); |
2744 | 4 | } |
2745 | | // Mark this symbol as init'ed. |
2746 | 4 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2747 | 4 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2748 | 4 | } |
2749 | 2 | ccv_array_free(to_parameter_indices); |
2750 | 2 | ccv_array_free(from_parameter_indices); |
2751 | 2 | if (id_map) |
2752 | 1 | kh_destroy(ccv_cnnp_parameter_id, id_map); |
2753 | 2 | if (updated_name) |
2754 | 1 | ccfree(updated_name); |
2755 | | // Mark it as incomplete so we will call init_1. |
2756 | 2 | if (ccv_cnnp_model_tensors_any_to_alloc(model, to_compiled_data)) |
2757 | 0 | to_compiled_data->tensors_init.v = (uint32_t*)((uintptr_t)to_compiled_data->tensors_init.v | (uintptr_t)1); |
2758 | 2 | else // Remove the flag. |
2759 | 2 | to_compiled_data->tensors_init.v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2760 | 2 | } |
2761 | | |
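A hedged usage sketch of the sharing API above (the model variables are illustrative, and the ALL_PARAMETERS selector is assumed from the public header): every parameter of adapted_model aliases the already-initialized tensors of base_model instead of copying them.

// Share all parameters of adapted_model from base_model without copying; no renamer.
ccv_cnnp_model_share_parameters(adapted_model, ccv_cnnp_model_parameters(adapted_model, ALL_PARAMETERS, ALL_PARAMETERS),
	base_model, ccv_cnnp_model_parameters(base_model, ALL_PARAMETERS, ALL_PARAMETERS), 0, 0);

Note that shared tensors are tagged by setting the low pointer bit, which is how _ccv_cnnp_compiled_data_free later tells borrowed tensors apart from owned ones and avoids double-freeing them.
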
2762 | | ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type) |
2763 | 24 | { |
2764 | 24 | if (!compiled_data->stream_map) |
2765 | 4 | compiled_data->stream_map = kh_init(stream_map); |
2766 | 24 | int ret = 0; |
2767 | 24 | khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret); |
2768 | 24 | assert(ret >= 0); |
2769 | 24 | ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k); |
2770 | | // If ret == 0, the key already exists and we can return directly; otherwise, create a new stream and return it.
2771 | 24 | if (ret != 0) |
2772 | 16 | { |
2773 | 16 | stream = ccv_nnc_stream_context_new(type); |
2774 | 16 | kh_val(compiled_data->stream_map, k) = stream; |
2775 | 16 | } |
2776 | 24 | return stream; |
2777 | 24 | } |
2778 | | |
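Callers build the khash key for the stream cache by folding the device id into the stream kind, mirroring the call sites later in this file (device_id and compiled_data are assumed to be in scope):

int type = CCV_STREAM_CONTEXT_GPU;          // Stream kind.
CCV_STREAM_SET_DEVICE_ID(type, device_id);  // Fold the device id into the key.
ccv_nnc_stream_context_t* const stream = ccv_cnnp_compiled_data_get_stream(compiled_data, type);

Because the map persists on compiled_data, repeated lookups for the same device reuse one stream context rather than allocating a new one on every call.
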
2779 | | void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters) |
2780 | 3 | { |
2781 | 3 | ccv_array_t* to_parameter_indices; |
2782 | 3 | int to_param_ref; |
2783 | 3 | ccv_array_t* from_parameter_indices; |
2784 | 3 | int from_param_ref; |
2785 | 3 | _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref, 0); |
2786 | | // Should have exactly the same number of parameter tensors.
2787 | 3 | if (to_param_ref < 0 && from_param_ref < 0) |
2788 | 3 | { assert(from_parameter_indices->rnum == to_parameter_indices->rnum); } |
2789 | | // To models. |
2790 | 3 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2791 | 3 | assert(to_compiled_data); |
2792 | | // From models. |
2793 | 3 | const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data; |
2794 | 3 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2795 | 3 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2796 | 3 | const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 10 ; |
2797 | 3 | assert(aux_in_size >= 0); |
2798 | 3 | assert(aux_out_size >= 0); |
2799 | 3 | int i, j; |
2800 | 3 | ccv_nnc_tensor_t* inputs[aux_in_size + 2]; |
2801 | 3 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2802 | 3 | for (i = 0; i < aux_in_size; i++0 ) |
2803 | 0 | inputs[i + 2] = aux_ins[i]; |
2804 | 3 | for (i = 0; i < aux_out_size; i++0 ) |
2805 | 0 | outputs[i + 1] = aux_outs[i]; |
2806 | 3 | const uint32_t* const from_init_v = CCV_NNC_INIT_V(from_compiled_data->tensors_init.v); |
2807 | 3 | uint32_t* const to_init_v = CCV_NNC_INIT_V(to_compiled_data->tensors_init.v); |
2808 | 6 | for (i = 0; i < rnum; i++3 ) |
2809 | 3 | { |
2810 | 3 | const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i); |
2811 | 3 | assert(src_d >= 0); |
2812 | 3 | assert(src_d < from_compiled_data->parameters->rnum); |
2813 | 3 | const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d; |
2814 | | // If the original is not init'ed, we cannot copy from it.
2815 | 3 | if (!(from_init_v[s >> 5] & (1u << (s & 0x1f)))) |
2816 | 0 | continue; |
2817 | 3 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2818 | 3 | assert(dest_d >= 0); |
2819 | 3 | assert(dest_d < to_compiled_data->parameters->rnum); |
2820 | 3 | if (parallel_count > 1) |
2821 | 2 | { |
2822 | 2 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2823 | 2 | ccv_nnc_stream_signal_t* signal; |
2824 | 2 | if (stream_context) |
2825 | 1 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2826 | 10 | for (j = 0; j < parallel_count; j++8 ) |
2827 | 8 | { |
2828 | 8 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d + j * to_parameter_size]); |
2829 | 8 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2830 | 8 | if (!dest || !src) |
2831 | 0 | { |
2832 | 0 | streams[j] = 0; |
2833 | 0 | continue; |
2834 | 0 | } |
2835 | | // At the moment, we can only handle tensors on the same device.
2836 | 8 | assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type)); |
2837 | 8 | assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type)); |
2838 | 8 | const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU0 ; |
2839 | 8 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type); |
2840 | 8 | int type = stream_type; |
2841 | 8 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
2842 | 8 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
2843 | | // Wait for the signal so work already queued on stream_context finishes first.
2844 | 8 | if (stream_context) |
2845 | 4 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
2846 | 8 | inputs[0] = outputs[0] = dest; |
2847 | 8 | inputs[1] = src; |
2848 | 8 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0); |
2849 | 8 | if (stream_context) |
2850 | 4 | { |
2851 | 4 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
2852 | 4 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
2853 | 4 | } |
2854 | 8 | streams[j] = stream_0; |
2855 | 8 | } |
2856 | | // If this should be blocking, block on it.
2857 | 2 | if (!stream_context) |
2858 | 5 | for (j = 0; 1 j < parallel_count; j++4 ) |
2859 | 4 | if (streams[j]) |
2860 | 4 | ccv_nnc_stream_context_wait(streams[j]); |
2861 | 2 | } else { |
2862 | 1 | ccv_nnc_tensor_t* const src = CCV_NNC_TENSOR(from_compiled_data->tensors.parameters[src_d]); |
2863 | 1 | assert(src); |
2864 | 1 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2865 | 1 | assert(dest); |
2866 | 1 | inputs[0] = outputs[0] = dest; |
2867 | 1 | inputs[1] = src; |
2868 | 1 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context); |
2869 | 1 | } |
2870 | | // Mark this symbol as init'ed. |
2871 | 3 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d; |
2872 | 3 | to_init_v[d >> 5] |= (1u << (d & 0x1f)); |
2873 | 3 | } |
2874 | 3 | ccv_array_free(to_parameter_indices); |
2875 | 3 | ccv_array_free(from_parameter_indices); |
2876 | 3 | } |
2877 | | |
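A hedged usage sketch of the zip map (variable names and coefficients are illustrative; CMD_ADD_FORWARD(p, q) is assumed to compute p * a + q * b elementwise): maintain an exponential moving average of model's parameters inside ema_model, i.e. dest = 0.999 * dest + 0.001 * src.

// Blend parameters from `model` into `ema_model` in place, blocking (no stream context).
ccv_cnnp_model_parameters_zip_map(ema_model, ccv_cnnp_model_parameters(ema_model, ALL_PARAMETERS, ALL_PARAMETERS),
	CMD_ADD_FORWARD(0.999, 0.001), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0,
	model, ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS));

Inside the loop, inputs[0] and outputs[0] are the destination tensor and inputs[1] is the source, so the command always updates the destination in place.
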
2878 | | void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context) |
2879 | 15 | { |
2880 | 15 | int to_param_ref; |
2881 | 15 | ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref); |
2882 | | // To models. |
2883 | 15 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2884 | 15 | assert(to_compiled_data); |
2885 | | // Tensors have to be init'ed already.
2886 | 15 | assert(!!to_compiled_data->tensors_init.v); |
2887 | 15 | assert(to_compiled_data->tensors.parameters); |
2888 | | // From models. |
2889 | 15 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2890 | 15 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2891 | 15 | const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2892 | 15 | assert(aux_in_size >= 0); |
2893 | 15 | assert(aux_out_size >= 0); |
2894 | 15 | int i, j; |
2895 | 15 | ccv_nnc_tensor_t* inputs[aux_in_size + 1]; |
2896 | 15 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2897 | 15 | for (i = 0; i < aux_in_size; i++0 ) |
2898 | 0 | inputs[i + 1] = aux_ins[i]; |
2899 | 15 | for (i = 0; i < aux_out_size; i++0 ) |
2900 | 0 | outputs[i + 1] = aux_outs[i]; |
2901 | 30 | for (i = 0; i < rnum; i++15 ) |
2902 | 15 | { |
2903 | 15 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2904 | 15 | assert(dest_d >= 0); |
2905 | 15 | assert(dest_d < to_compiled_data->parameters->rnum); |
2906 | 15 | if (parallel_count > 1) |
2907 | 4 | { |
2908 | 4 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2909 | 4 | ccv_nnc_stream_signal_t* signal; |
2910 | 4 | if (stream_context) |
2911 | 1 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2912 | 20 | for (j = 0; j < parallel_count; j++16 ) |
2913 | 16 | { |
2914 | 16 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size]); |
2915 | 16 | if (!dest) |
2916 | 0 | { |
2917 | 0 | streams[j] = 0; |
2918 | 0 | continue; |
2919 | 0 | } |
2920 | 16 | const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU0 ; |
2921 | 16 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type); |
2922 | 16 | int type = stream_type; |
2923 | 16 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
2924 | 16 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
2925 | | // Wait for the signal so work already queued on stream_context finishes first.
2926 | 16 | if (stream_context) |
2927 | 4 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
2928 | 16 | inputs[0] = outputs[0] = dest; |
2929 | 16 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0); |
2930 | 16 | if (stream_context) |
2931 | 4 | { |
2932 | 4 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
2933 | 4 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
2934 | 4 | } |
2935 | 16 | streams[j] = stream_0; |
2936 | 16 | } |
2937 | | // If this should be blocking, block on it.
2938 | 4 | if (!stream_context) |
2939 | 15 | for (j = 0; 3 j < parallel_count; j++12 ) |
2940 | 12 | if (streams[j]) |
2941 | 12 | ccv_nnc_stream_context_wait(streams[j]); |
2942 | 11 | } else { |
2943 | 11 | ccv_nnc_tensor_t* const dest = CCV_NNC_TENSOR(to_compiled_data->tensors.parameters[dest_d]); |
2944 | 11 | assert(dest); |
2945 | 11 | inputs[0] = outputs[0] = dest; |
2946 | 11 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context); |
2947 | 11 | } |
2948 | | // No need to mark this symbol as init'ed; it already is.
2949 | 15 | } |
2950 | 15 | ccv_array_free(to_parameter_indices); |
2951 | 15 | } |
2952 | | |
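A hedged usage sketch of the in-place map (illustrative values; CMD_SET_FORWARD(v) is assumed to fill a tensor with v): reset every parameter of the model to zero.

// Apply a command in place to every parameter across all parallel copies.
ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS),
	CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
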
2953 | | void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context) |
2954 | 6 | { |
2955 | 6 | int to_param_ref; |
2956 | 6 | ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref); |
2957 | | // To models. |
2958 | 6 | ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data; |
2959 | 6 | assert(to_compiled_data); |
2960 | | // Tensors have to be init'ed already.
2961 | 6 | assert(!!to_compiled_data->tensors_init.v); |
2962 | 6 | ccv_nnc_tensor_t** tensor_gradients; |
2963 | 6 | if (to_compiled_data->backward.count > 1) |
2964 | 3 | tensor_gradients = to_compiled_data->tensors.accum_gradients; |
2965 | 3 | else |
2966 | 3 | tensor_gradients = to_compiled_data->tensors.gradients; |
2967 | 6 | assert(tensor_gradients); |
2968 | | // From models. |
2969 | 6 | const int parallel_count = ccv_max(model->parallel_count, 1); |
2970 | 6 | const int to_parameter_size = to_compiled_data->parameters->rnum; |
2971 | 6 | const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 10 ; |
2972 | 6 | assert(aux_in_size >= 0); |
2973 | 6 | assert(aux_out_size >= 0); |
2974 | 6 | int i, j; |
2975 | 6 | ccv_nnc_tensor_t* inputs[aux_in_size + 1]; |
2976 | 6 | ccv_nnc_tensor_t* outputs[aux_out_size + 1]; |
2977 | 10 | for (i = 0; i < aux_in_size; i++4 ) |
2978 | 4 | inputs[i + 1] = aux_ins[i]; |
2979 | 14 | for (i = 0; i < aux_out_size; i++8 ) |
2980 | 8 | outputs[i + 1] = aux_outs[i]; |
2981 | 12 | for (i = 0; i < rnum; i++6 ) |
2982 | 6 | { |
2983 | 6 | const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i); |
2984 | 6 | assert(dest_d >= 0); |
2985 | 6 | assert(dest_d < to_compiled_data->parameters->rnum); |
2986 | 6 | if (parallel_count > 1) |
2987 | 0 | { |
2988 | 0 | ccv_nnc_stream_context_t* streams[parallel_count]; |
2989 | 0 | ccv_nnc_stream_signal_t* signal; |
2990 | 0 | if (stream_context) |
2991 | 0 | signal = ccv_nnc_stream_context_emit_signal_new(stream_context); |
2992 | 0 | for (j = 0; j < parallel_count; j++) |
2993 | 0 | { |
2994 | 0 | ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size]; |
2995 | 0 | if (!dest) |
2996 | 0 | { |
2997 | 0 | streams[j] = 0; |
2998 | 0 | continue; |
2999 | 0 | } |
3000 | 0 | const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU; |
3001 | 0 | const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type); |
3002 | 0 | int type = stream_type; |
3003 | 0 | CCV_STREAM_SET_DEVICE_ID(type, device_id); |
3004 | 0 | ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type); |
3005 | | // Wait for the signal so work already queued on stream_context finishes first.
3006 | 0 | if (stream_context) |
3007 | 0 | ccv_nnc_stream_context_wait_signal(stream_0, signal); |
3008 | 0 | inputs[0] = outputs[0] = dest; |
3009 | 0 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0); |
3010 | 0 | if (stream_context) |
3011 | 0 | { |
3012 | 0 | ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0); |
3013 | 0 | ccv_nnc_stream_context_wait_signal(stream_context, signal); |
3014 | 0 | } |
3015 | 0 | streams[j] = stream_0; |
3016 | 0 | } |
3017 | | // If this should be blocking, block on it.
3018 | 0 | if (!stream_context) |
3019 | 0 | for (j = 0; j < parallel_count; j++) |
3020 | 0 | if (streams[j]) |
3021 | 0 | ccv_nnc_stream_context_wait(streams[j]); |
3022 | 6 | } else { |
3023 | 6 | ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d]; |
3024 | 6 | if (!dest) |
3025 | 0 | continue; |
3026 | 6 | assert(dest); |
3027 | 6 | inputs[0] = outputs[0] = dest; |
3028 | 6 | ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context); |
3029 | 6 | } |
3030 | | // No need to mark this symbol as init'ed; it already is.
3031 | 6 | } |
3032 | 6 | ccv_array_free(to_parameter_indices); |
3033 | 6 | } |
3034 | | |
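A hedged usage sketch (same assumptions as above): clear the gradients, or the accumulated gradients when backward.count > 1, before starting another round of gradient accumulation.

ccv_cnnp_model_parameter_gradients_map(model, ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS),
	CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);
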
3035 | | ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model) |
3036 | 2.20k | { |
3037 | 2.20k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
3038 | 2.20k | assert(compiled_data); |
3039 | 2.20k | return compiled_data->minimize.minimizer; |
3040 | 2.20k | } |
3041 | | |
3042 | | void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size) |
3043 | 4.36k | { |
3044 | 4.36k | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
3045 | 4.36k | assert(compiled_data); |
3046 | 4.36k | const int parameter_size = compiled_data->parameters->rnum; |
3047 | 4.36k | if (parameter_size == 0) |
3048 | 3 | return; |
3049 | 4.35k | if (reset) |
3050 | 2.49k | { assert(set_parameters == 0 && set_parameter_size == 0); } |
3051 | 4.35k | const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
3052 | 4.35k | const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer); |
3053 | 4.35k | if (saved_aux_size > compiled_data->minimize.max_saved_aux_size) |
3054 | 7 | compiled_data->minimize.max_saved_aux_size = saved_aux_size; |
3055 | 4.35k | const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size; |
3056 | | // We update all parameters; at this point, we have one minimizer.
3057 | 4.35k | if (set_parameters == 0 || set_parameter_size == 0301 ) |
3058 | 4.05k | compiled_data->minimize.minimizer = minimizer; |
3059 | 4.35k | int i; |
3060 | 4.35k | if (set_parameters && set_parameter_size301 ) |
3061 | 301 | { |
3062 | | // We need to record which minimizer goes with these parameters.
3063 | 301 | if (!compiled_data->minimize.parameters) |
3064 | 5 | compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0); |
3065 | 301 | ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t)); |
3066 | 301 | set_minimizer_for_parameter->minimizer = minimizer; |
3067 | 301 | set_minimizer_for_parameter->parameter_size = set_parameter_size; |
3068 | 301 | memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size); |
3069 | 301 | ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter); |
3070 | 301 | } |
3071 | | // If reset is true, clear the parameters array. |
3072 | 4.35k | if (reset && compiled_data->minimize.parameters2.49k ) |
3073 | 291 | { |
3074 | 582 | for (i = 0; i < compiled_data->minimize.parameters->rnum; i++291 ) |
3075 | 291 | ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i)); |
3076 | 291 | ccv_array_clear(compiled_data->minimize.parameters); |
3077 | 291 | } |
3078 | 4.35k | if (!compiled_data->update_nodes) |
3079 | 9 | return; |
3080 | 4.34k | ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph; |
3081 | 4.34k | assert(symbolic_graph); |
3082 | 4.34k | if (saved_aux_size > old_max_saved_aux_size) |
3083 | 7 | { |
3084 | 7 | assert(compiled_data->updated_parameters); |
3085 | | // Reallocate first, move them around later. |
3086 | 7 | compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size); |
3087 | 7 | compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size); |
3088 | 7 | compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size); |
3089 | | // We need to do this from back to front because saved_aux_size > old_saved_aux_size, so the regions could overlap.
3090 | 7 | _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size); |
3091 | 7 | } |
3092 | 4.34k | int flag = 0; |
3093 | 4.34k | const int parallel_count = ccv_max(model->parallel_count, 1); |
3094 | 4.34k | if (set_parameters && set_parameter_size296 ) |
3095 | 296 | { |
3096 | 296 | ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0); |
3097 | 592 | for (i = 0; i < set_parameter_size; i++296 ) |
3098 | 296 | { |
3099 | 296 | const int param_sel = set_parameters[i]->param_sel > 0 ? set_parameters[i]->param_sel - 1291 : set_parameters[i]->param_sel5 ; |
3100 | 296 | assert(set_parameters[i]->param_sel != 0); |
3101 | 296 | const int old_rnum = parameter_indices->rnum; |
3102 | 296 | ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices); |
3103 | 296 | const int param_ref = set_parameters[i]->param_ref > 0 ? set_parameters[i]->param_ref - 10 : set_parameters[i]->param_ref; |
3104 | 296 | assert(set_parameters[i]->param_ref != 0); |
3105 | 296 | if (param_ref >= 0) |
3106 | 0 | { |
3107 | 0 | assert(param_ref + old_rnum < parameter_indices->rnum); |
3108 | 0 | *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum); |
3109 | 0 | parameter_indices->rnum = old_rnum + 1; |
3110 | 0 | } |
3111 | 296 | } |
3112 | | // We may have duplicated indices, but that is OK; we will just set them twice.
3113 | 5.24k | for (i = 0; 296 i < parameter_indices->rnum; i++4.95k ) |
3114 | 4.95k | { |
3115 | 4.95k | const int d = *(int*)ccv_array_get(parameter_indices, i); |
3116 | 4.95k | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d)) |
3117 | 0 | flag = 1; |
3118 | 4.95k | } |
3119 | 296 | ccv_array_free(parameter_indices); |
3120 | 4.05k | } else { |
3121 | 19.1k | for (i = 0; i < parameter_size; i++15.0k ) |
3122 | 15.0k | if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i)) |
3123 | 65 | flag = 1; |
3124 | 4.05k | if (compiled_data->minimize.parameters) |
3125 | 291 | if (_ccv_cnnp_apply_parameters_with_minimizer(model)) |
3126 | 0 | flag = 1; |
3127 | 4.05k | } |
3128 | 4.34k | if (flag) |
3129 | 7 | { |
3130 | | // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply gradients graph.
3131 | 7 | if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE) |
3132 | 0 | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
3133 | 7 | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
3134 | 7 | } |
3135 | 4.34k | } |
3136 | | |
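A hedged usage sketch (hyperparameters are illustrative, and the exact CMD_SGD_FORWARD parameter order is assumed; `dense` is a hypothetical sub-model): reset the default minimizer for every parameter, then override it for one sub-model's parameters only.

// Reset must not pass a parameter selection (the assert above enforces that).
ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.001, 1, 0.99, 0.9, 0.9), 1, 0, 0);
// Override the minimizer for just the parameters owned by `dense`.
const ccv_cnnp_model_io_t dense_params = ccv_cnnp_model_parameters(dense, ALL_PARAMETERS, ALL_PARAMETERS);
ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.01, 1, 0.99, 0.9, 0.9), 0, &dense_params, 1);

Per-parameter overrides are recorded in compiled_data->minimize.parameters so they can be re-applied whenever the update nodes are rebuilt.
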
3137 | | void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params) |
3138 | 0 | { |
3139 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
3140 | 0 | assert(compiled_data); |
3141 | 0 | compiled_data->compile_params = compile_params; |
3142 | 0 | } |
3143 | | |
3144 | | void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size) |
3145 | 48 | { |
3146 | 48 | if (model->graph && out_size > 047 ) |
3147 | 47 | ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]); |
3148 | 48 | if (model->compiled_data && model->compiled_data->graph47 && out_size > 116 ) |
3149 | 0 | ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]); |
3150 | 48 | if (model->compiled_data && model->compiled_data->backward.accum47 && out_size > 20 ) |
3151 | 0 | ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]); |
3152 | 48 | if (model->compiled_data && model->compiled_data->apply_gradients.graph47 && out_size > 33 ) |
3153 | 0 | ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]); |
3154 | 48 | } |
3155 | | |
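A hedged usage sketch: write the symbolic graph (and, once compiled and run, the concrete graph) to Graphviz dot files for inspection.

FILE* outs[2] = { fopen("model.dot", "w+"), fopen("model_run.dot", "w+") };
ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, outs, 2);
fclose(outs[0]);
fclose(outs[1]);
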
3156 | | void ccv_cnnp_model_format(const ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_format_f format_fn, void* const context) |
3157 | 0 | { |
3158 | 0 | if (model->graph) |
3159 | 0 | ccv_nnc_symbolic_graph_format(model->graph, 0, 0, 0, 0, format_fn, context); |
3160 | 0 | } |
3161 | | |
3162 | | static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data) |
3163 | 2.30k | { |
3164 | 2.30k | int i; |
3165 | 2.30k | const int parameter_size = compiled_data->parameters->rnum; |
3166 | 2.30k | ccv_array_free(compiled_data->parameters); |
3167 | 2.30k | if (compiled_data->parameter_flags) |
3168 | 10 | ccfree(compiled_data->parameter_flags); |
3169 | 2.30k | const int internal_size = compiled_data->internals->rnum; |
3170 | 2.30k | ccv_array_free(compiled_data->internals); |
3171 | 2.30k | assert(compiled_data->ids.parameters->rnum == parameter_size); |
3172 | 2.30k | assert(compiled_data->ids.internals->rnum == internal_size); |
3173 | 5.25k | for (i = 0; 2.30k i < parameter_size; i++2.95k ) |
3174 | 2.95k | ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i)); |
3175 | 2.30k | ccv_array_free(compiled_data->ids.parameters); |
3176 | 2.46k | for (i = 0; i < internal_size; i++165 ) |
3177 | 165 | ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i)); |
3178 | 2.30k | ccv_array_free(compiled_data->ids.internals); |
3179 | 2.30k | const int parallel_count = ccv_max(model->parallel_count, 1); |
3180 | 2.30k | if (compiled_data->tensors.parameters) |
3181 | 94 | { |
3182 | 798 | for (i = 0; i < parameter_size * parallel_count; i++704 ) |
3183 | | // If it is not marked as borrowed (shared from another model), we can free it.
3184 | 704 | if (!((uintptr_t)compiled_data->tensors.parameters[i] & (uintptr_t)1)) |
3185 | 700 | if (compiled_data->tensors.parameters[i]) |
3186 | 698 | ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]); |
3187 | 253 | for (i = 0; i < internal_size * parallel_count; i++159 ) |
3188 | 159 | if (compiled_data->tensors.internals[i]) |
3189 | 158 | ccv_nnc_tensor_free(compiled_data->tensors.internals[i]); |
3190 | 94 | ccfree(compiled_data->tensors.parameters); |
3191 | 94 | } |
3192 | 2.30k | if (compiled_data->tensors.gradients) |
3193 | 31 | { |
3194 | 361 | for (i = 0; i < parameter_size * parallel_count; i++330 ) |
3195 | 330 | { |
3196 | 330 | if (compiled_data->tensors.gradients[i]) |
3197 | 328 | ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]); |
3198 | 330 | if (compiled_data->tensors.accum_gradients[i]) |
3199 | 15 | ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]); |
3200 | 330 | } |
3201 | 31 | ccfree(compiled_data->tensors.gradients); |
3202 | 31 | } |
3203 | 2.30k | if (compiled_data->minimize.parameters) |
3204 | 5 | { |
3205 | 15 | for (i = 0; i < compiled_data->minimize.parameters->rnum; i++10 ) |
3206 | 10 | ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i)); |
3207 | 5 | ccv_array_free(compiled_data->minimize.parameters); |
3208 | 5 | } |
3209 | 2.30k | if (compiled_data->rewindables) |
3210 | 44 | ccv_array_free(compiled_data->rewindables); |
3211 | 2.30k | if (compiled_data->tensors_init.v) |
3212 | 94 | ccfree(CCV_NNC_INIT_V(compiled_data->tensors_init.v)); |
3213 | 2.30k | if (compiled_data->evaluate.tos) |
3214 | 2.30k | ccfree(compiled_data->evaluate.tos); |
3215 | 2.30k | compiled_data->evaluate.tos = 0; |
3216 | 2.30k | if (compiled_data->stream_map) |
3217 | 4 | { |
3218 | 4 | khiter_t k; |
3219 | 36 | for (k = kh_begin4 (compiled_data->stream_map); k != kh_end(compiled_data->stream_map); ++k32 ) |
3220 | 32 | { |
3221 | 32 | if (!kh_exist(compiled_data->stream_map, k)) |
3222 | 16 | continue; |
3223 | 16 | ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k); |
3224 | 16 | ccv_nnc_stream_context_free(stream); |
3225 | 16 | } |
3226 | 4 | kh_destroy(stream_map, compiled_data->stream_map); |
3227 | 4 | } |
3228 | 2.30k | _ccv_cnnp_compiled_data_graph_free(compiled_data); |
3229 | 2.30k | _ccv_cnnp_compiled_data_gradient_free(compiled_data); |
3230 | 2.30k | _ccv_cnnp_compiled_data_backward_free(compiled_data); |
3231 | 2.30k | _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data); |
3232 | 2.30k | if (compiled_data->gradient_checkpoints) |
3233 | 2 | { |
3234 | 4 | for (i = 0; i < compiled_data->gradient_checkpoints->rnum; i++2 ) |
3235 | 2 | { |
3236 | 2 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(compiled_data->gradient_checkpoints, i); |
3237 | 2 | assert(checkpoint->inputs); |
3238 | 2 | ccfree(checkpoint->inputs); |
3239 | 2 | ccv_array_free(checkpoint->tensor_symbols); |
3240 | 2 | } |
3241 | 2 | ccv_array_free(compiled_data->gradient_checkpoints); |
3242 | 2 | } |
3243 | 2.30k | ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc); |
3244 | 2.30k | ccfree(compiled_data); |
3245 | 2.30k | } |
3246 | | |
3247 | | void ccv_cnnp_model_free(ccv_cnnp_model_t* const model) |
3248 | 5.44k | { |
3249 | 5.44k | ccv_cnnp_model_deinit(model); |
3250 | 5.44k | if (model->isa->dealloc) |
3251 | 1.22k | model->isa->dealloc(model); |
3252 | 5.44k | if (model->io) |
3253 | 794 | { |
3254 | 794 | int i; |
3255 | 1.95k | for (i = 0; i < model->io->rnum; i++1.15k ) |
3256 | 1.15k | { |
3257 | 1.15k | ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i); |
3258 | 1.15k | if (model_io->outgoings) |
3259 | 650 | ccv_array_free(model_io->outgoings); |
3260 | 1.15k | if (model_io->incomings) |
3261 | 591 | ccv_array_free(model_io->incomings); |
3262 | 1.15k | if (model_io->dependencies) |
3263 | 2 | ccv_array_free(model_io->dependencies); |
3264 | 1.15k | ccfree(model_io); |
3265 | 1.15k | } |
3266 | 794 | ccv_array_free(model->io); |
3267 | 794 | } |
3268 | 5.44k | if (model->parameter_indices) |
3269 | 2.52k | ccv_array_free(model->parameter_indices); |
3270 | 5.44k | if (model->inputs) |
3271 | 2.30k | ccfree(model->inputs); |
3272 | 5.44k | if (model->graph) |
3273 | 2.30k | ccv_nnc_symbolic_graph_free(model->graph); |
3274 | 5.44k | if (model->compiled_data) |
3275 | 2.30k | _ccv_cnnp_compiled_data_free(model, model->compiled_data); |
3276 | 5.44k | if (model->name) |
3277 | 216 | ccfree(model->name); |
3278 | 5.44k | ccfree(model); |
3279 | 5.44k | } |
3280 | | |
3281 | | void ccv_cnnp_model_cancel(ccv_cnnp_model_t* const model) |
3282 | 0 | { |
3283 | 0 | ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data; |
3284 | 0 | if (!compiled_data) |
3285 | 0 | return; |
3286 | 0 | if (compiled_data->graph) |
3287 | 0 | ccv_nnc_graph_cancel(compiled_data->graph); |
3288 | 0 | if (compiled_data->apply_gradients.graph) |
3289 | 0 | ccv_nnc_graph_cancel(compiled_data->apply_gradients.graph); |
3290 | 0 | } |
3291 | | |
3292 | | void ccv_cnnp_model_set_flags(ccv_cnnp_model_t* const model, const int flags) |
3293 | 0 | { |
3294 | 0 | model->exec_flags = flags; |
3295 | 0 | } |
3296 | | |
3297 | | int ccv_cnnp_model_flags(ccv_cnnp_model_t* const model) |
3298 | 0 | { |
3299 | 0 | return model->exec_flags; |
3300 | 0 | } |