// lib/nnc/ccv_nnc_dynamic_graph_apply_gradients.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_dynamic_graph.h"

// MARK - Level-4.5 API

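// Apply the gradients to the parameters with the given minimizer. gradient_size must
// equal parameter_size, and saved_aux must supply ccv_nnc_minimizer_saved_aux_size(minimizer)
// tensor variables per parameter. Pass parallel > 1 when the parameter / gradient arrays
// carry that many data-parallel replicas; stream_context, if given, runs the update
// asynchronously.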
void ccv_nnc_dynamic_graph_apply_gradients(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_cmd_t minimizer, const ccv_nnc_tensor_variable_t* const gradients, const int gradient_size, ccv_nnc_tensor_variable_t* const parameters, const int parameter_size, ccv_nnc_tensor_variable_t* const saved_aux, const int parallel, ccv_nnc_stream_context_t* const stream_context)
{
	assert(gradient_size == parameter_size);
	assert(!dynamic_graph->no_grad);
	// Call apply gradients on the stateful execs first.
	int i, j;
	if (dynamic_graph->stateful_execs)
	{
		for (i = 0; i < dynamic_graph->stateful_execs->rnum; i++)
		{
			ccv_nnc_stateful_exec_t* const stateful_exec = *(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i);
			// We only apply gradients once the backward round has completed.
			if (stateful_exec && stateful_exec->did_backward_but_not_apply_gradients)
			{
				const ccv_nnc_stateful_cmd_vtab_t* const isa = (ccv_nnc_stateful_cmd_vtab_t*)stateful_exec->cmd.isa;
				if (isa->apply_gradients)
					isa->apply_gradients(stateful_exec->cmd, stream_context);
				stateful_exec->did_backward_but_not_apply_gradients = 0;
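				// If the exec has been marked for freeing, release it now and record the
				// lowest vacant slot so future stateful execs can reuse it.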
				if (stateful_exec->should_free)
				{
					ccfree(stateful_exec);
					*(ccv_nnc_stateful_exec_t**)ccv_array_get(dynamic_graph->stateful_execs, i) = 0;
					if (i < dynamic_graph->reuse_stateful_exec || dynamic_graph->reuse_stateful_exec < 0)
						dynamic_graph->reuse_stateful_exec = i;
				}
			}
		}
	}
	if (parameter_size == 0)
		return;
	const int aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
	const int saved_aux_size = parameter_size * aux_size;
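	// saved_aux is laid out parameter-major: the aux tensors for parameter i occupy
	// saved_aux[i * aux_size .. i * aux_size + aux_size - 1].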
	ccv_nnc_tensor_symbol_t update_inputs[aux_size + 2];
	ccv_nnc_tensor_symbol_t update_outputs[aux_size + 1];
	int freeable_size = 0;
	ccv_nnc_graph_exec_symbol_t sources[parameter_size];
	ccv_nnc_graph_exec_symbol_t minimizes[parameter_size];
	ccv_nnc_tensor_variable_t freeables[parameter_size + saved_aux_size];
	ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack, 0);
	ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), parameter_size * 3 + saved_aux_size * 2, 0);
	const int parallel_count = ccv_max(parallel, 1);
	const int per_parameter_size = parameter_size / parallel_count;
	assert((parameter_size % parallel_count) == 0);
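	// With data parallelism, the parameter array holds parallel_count replicas of
	// per_parameter_size parameters each: replica j of parameter i sits at index
	// i + j * per_parameter_size.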
	ccv_nnc_tensor_symbol_t* const allreduce_inputs = parallel_count > 1 ? (ccv_nnc_tensor_symbol_t*)alloca(sizeof(ccv_nnc_tensor_symbol_t) * parallel_count * 2 + sizeof(ccv_nnc_graph_exec_symbol_t) * per_parameter_size) : 0;
	ccv_nnc_tensor_symbol_t* const allreduce_outputs = allreduce_inputs ? allreduce_inputs + parallel_count : 0;
	ccv_nnc_graph_exec_symbol_t* const allreduces = allreduce_outputs ? (ccv_nnc_graph_exec_symbol_t*)(allreduce_outputs + parallel_count) : 0;
	if (parallel_count > 1) // Doing allreduce first.
	{
		for (i = 0; i < per_parameter_size; i++)
		{
			for (j = 0; j < parallel_count; j++)
			{
				const int idx = i + j * per_parameter_size;
				assert(parameters[idx]->symbol.d >= 0);
				const ccv_nnc_tensor_param_t info = parameters[idx]->info;
				const ccv_nnc_tensor_symbol_t gradient = allreduce_inputs[j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
				const ccv_nnc_tensor_bind_t bind = {
					.symbol = gradient,
					.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, gradients[idx], stream_context)
				};
				ccv_array_push(tensor_binds, &bind);
				allreduce_outputs[j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
			}
			allreduces[i] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_COMM_ALLREDUCE_FORWARD(), allreduce_inputs, parallel_count, allreduce_outputs, parallel_count, 0);
			for (j = 0; j < parallel_count; j++)
			{
				const int idx = i + j * per_parameter_size;
				assert(parameters[idx]->symbol.d >= 0);
				const ccv_nnc_tensor_param_t info = parameters[idx]->info;
				update_inputs[0] = allreduce_outputs[j];
				update_inputs[1] = parameters[idx]->symbol;
				ccv_nnc_tensor_bind_t bind = {
					.symbol = parameters[idx]->symbol,
					.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[idx], stream_context)
				};
				ccv_array_push(tensor_binds, &bind);
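				// Exchange the parameter into a fresh variable: the old tensor stays bound
				// as the update input while the update output lands in new storage. The
				// exchanged-out variable is collected in freeables and freed at the end.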
				freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, parameters[idx]);
				bind.symbol = update_outputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
				bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[idx], stream_context);
				ccv_array_push(tensor_binds, &bind);
				int k;
				ccv_nnc_tensor_symbol_t set_zeros[aux_size];
				int set_zero_size = 0;
				for (k = 0; k < aux_size; k++)
					update_inputs[2 + k] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
				for (k = 0; k < aux_size; k++)
					if (!ccv_nnc_tensor_variable_contains_value(saved_aux[idx * aux_size + k])) // Need to 0 init the saved aux in this case.
					{
						if (ccv_nnc_is_tensor_auto(saved_aux[idx * aux_size + k]->info))
							saved_aux[idx * aux_size + k]->info = info;
						set_zeros[set_zero_size++] = update_inputs[2 + k];
					}
				for (k = 0; k < aux_size; k++)
				{
					bind.symbol = update_inputs[2 + k];
					bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[idx * aux_size + k], stream_context);
					ccv_array_push(tensor_binds, &bind);
					freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, saved_aux[idx * aux_size + k]);
					bind.symbol = update_outputs[1 + k] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
					bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[idx * aux_size + k], stream_context);
					ccv_array_push(tensor_binds, &bind);
				}
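				// The minimizer node consumes [gradient, parameter, aux...] and produces
				// [updated parameter, updated aux...].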
				sources[idx] = minimizes[idx] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, minimizer, update_inputs, aux_size + 2, update_outputs, aux_size + 1, 0);
				if (set_zero_size > 0)
				{
					const ccv_nnc_graph_exec_symbol_t set_zero = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(0), 0, 0, set_zeros, set_zero_size, 0);
					ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_zero, minimizes[idx]);
					ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, allreduces[i], set_zero);
					sources[idx] = set_zero;
				} else
					ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, allreduces[i], minimizes[idx]);
			}
		}
	} else {
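		// Single-device path (per_parameter_size == parameter_size here): each gradient
		// feeds its own minimizer node directly, with no allreduce in between.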
		for (i = 0; i < per_parameter_size; i++)
		{
			assert(parameters[i]->symbol.d >= 0);
			const ccv_nnc_tensor_param_t info = parameters[i]->info;
			const ccv_nnc_tensor_symbol_t gradient = update_inputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
			ccv_nnc_tensor_bind_t bind = {
				.symbol = gradient,
				.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, gradients[i], stream_context)
			};
			ccv_array_push(tensor_binds, &bind);
			update_inputs[1] = parameters[i]->symbol;
			bind.symbol = parameters[i]->symbol;
			bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[i], stream_context);
			ccv_array_push(tensor_binds, &bind);
			freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, parameters[i]);
			bind.symbol = update_outputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
			bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[i], stream_context);
			ccv_array_push(tensor_binds, &bind);
			ccv_nnc_tensor_symbol_t set_zeros[aux_size];
			int set_zero_size = 0;
			for (j = 0; j < aux_size; j++)
				update_inputs[2 + j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
			for (j = 0; j < aux_size; j++)
				if (!ccv_nnc_tensor_variable_contains_value(saved_aux[i * aux_size + j])) // Need to 0 init the saved aux in this case.
				{
					if (ccv_nnc_is_tensor_auto(saved_aux[i * aux_size + j]->info))
						saved_aux[i * aux_size + j]->info = info;
					set_zeros[set_zero_size++] = update_inputs[2 + j];
				}
			for (j = 0; j < aux_size; j++)
			{
				bind.symbol = update_inputs[2 + j];
				bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[i * aux_size + j], stream_context);
				ccv_array_push(tensor_binds, &bind);
				freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, saved_aux[i * aux_size + j]);
				bind.symbol = update_outputs[1 + j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
				bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[i * aux_size + j], stream_context);
				ccv_array_push(tensor_binds, &bind);
			}
			sources[i] = minimizes[i] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, minimizer, update_inputs, aux_size + 2, update_outputs, aux_size + 1, 0);
			if (set_zero_size > 0)
			{
				const ccv_nnc_graph_exec_symbol_t set_zero = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(0), 0, 0, set_zeros, set_zero_size, 0);
				ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_zero, minimizes[i]);
				sources[i] = set_zero;
			}
		}
	}
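	// Compile and run the update graph. Arena allocations are routed through the dynamic
	// graph's XPU allocator (tagged with the current stream) so the buffers come from,
	// and return to, the same pool the rest of the dynamic graph uses.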
	ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
		.xpu_alloc = &dynamic_graph->xpu_alloc,
		.stream = stream_context
	};
	ccv_nnc_symbolic_graph_compile_param_t compile_params = {
		.allocator = {
			.isa = &ccv_nnc_dy_allocator_isa,
			.context = {
				.alloc = &xpu_alloc,
				.free = &dynamic_graph->xpu_alloc,
			}
		}
	};
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* exec_arena = 0;
	ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
		(ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
		0, 0,
		parallel_count > 1 ? allreduces : sources, parallel_count > 1 ? per_parameter_size : parameter_size,
		minimizes, parameter_size,
		&graph, &tensor_arena, &exec_arena);
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_array_free(tensor_binds);
	// Remove newly added symbols to restore the graph.
	for (i = 0; i < symbol_stack->rnum; i++)
	{
		const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
		if (symbol->type == CCV_NNC_SYMBOL_TENSOR || symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS)
			ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
		else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
			ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
	}
	ccv_array_free(symbol_stack);
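	// With a stream context, run asynchronously: the arena buffers are released eagerly,
	// while the graph and arenas themselves are kept alive in a compilation artifact and
	// freed from a stream callback once execution completes.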
	if (stream_context)
	{
		ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context), dynamic_graph->max_stream_count);
		ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
		ccv_nnc_tensor_arena_buffer_free(tensor_arena);
		ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
		ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
	} else {
		if (parallel_count > 1)
		{ // The graph needs a schedule; figure out which stream type to run on.
			int flag = 0;
			for (i = 0; !flag && i < parameter_size; i++)
				flag = (CCV_TENSOR_GET_MEMORY(parameters[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
			const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
			ccv_nnc_graph_set_default_static_schedule(graph, stream_type, dynamic_graph->max_stream_count);
			ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
			ccv_nnc_stream_context_wait(default_stream);
		} else
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
		ccv_nnc_graph_free(graph);
		ccv_nnc_tensor_arena_free(tensor_arena);
		ccv_nnc_graph_exec_arena_free(exec_arena);
	}
	// Now we are able to free the exchanged-out variables. This needs to be the last step,
	// otherwise some of the exec symbols above may be freed by this operation.
	for (i = 0; i < freeable_size; i++)
		ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
}
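
/* Illustrative usage sketch (not part of this file): how apply_gradients pairs with
   ccv_nnc_dynamic_graph_backward in a training step. The SGD hyperparameters and the
   variables `graph`, `loss`, `params`, `grads`, and `param_size` are hypothetical
   placeholders, not values prescribed by the library.

	const ccv_nnc_cmd_t sgd = CMD_SGD_FORWARD(0, 0.001, 1, 0.995, 0.9, 0.9);
	const int aux_size = ccv_nnc_minimizer_saved_aux_size(sgd);
	ccv_nnc_tensor_variable_t saved_aux[param_size * aux_size];
	for (int i = 0; i < param_size * aux_size; i++)
		saved_aux[i] = ccv_nnc_tensor_variable_new(graph); // Zero-initialized lazily by apply_gradients.
	// Compute gradients of the loss w.r.t. the parameters, then apply them.
	ccv_nnc_dynamic_graph_backward(graph, TENSOR_VARIABLE_LIST(loss), 0, params, param_size, grads, param_size, 0);
	ccv_nnc_dynamic_graph_apply_gradients(graph, sgd, grads, param_size, params, param_size, saved_aux, 0, 0);
*/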