/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_backward.c
Line | Count | Source |
1 | | #include "ccv_nnc.h" |
2 | | #include "ccv_nnc_easy.h" |
3 | | #include "ccv_nnc_internal.h" |
4 | | #include "ccv_internal.h" |
5 | | #include "_ccv_nnc_symbolic_graph.h" |
6 | | |
7 | | // MARK - Level-3.5 API |
8 | | |
9 | | typedef struct { |
10 | | int f_wrt; // Check if both f_symbols and wrt_symbols flow through this node. |
11 | | ccv_array_t* outgoings; // backward traverse nodes. |
12 | | uint64_t* input_bitmasks; |
13 | | int input_bitmask_size; |
14 | | uint64_t* output_bitmasks; |
15 | | int output_bitmask_size; |
16 | | } ccv_nnc_graph_backward_info_t; |
17 | | |
18 | | typedef struct { |
19 | | int input_size; |
20 | | int* inputs; |
21 | | int output; |
22 | | ccv_array_t* outgoings; |
23 | | float value; |
24 | | ccv_nnc_graph_exec_symbol_t symbol; |
25 | | } ccv_nnc_sum_or_set_graph_exec_symbol_t; |
26 | | |
27 | | typedef struct { |
28 | | int input_size; |
29 | | int output_size; |
30 | | int* inputs; |
31 | | int* outputs; |
32 | | ccv_array_t* outgoings; |
33 | | ccv_nnc_cmd_t cmd; |
34 | | ccv_nnc_graph_exec_symbol_t symbol; |
35 | | } ccv_nnc_autograd_graph_exec_symbol_t; |
36 | | |
37 | | typedef struct { |
38 | | int d; // The pointer to the forward level object. |
39 | | int alias_ref; // The alias ref to itself (autograd_tensor_symbols array). |
40 | | int flags; // Flags for this symbol. |
41 | | ccv_nnc_tensor_symbol_t symbol; |
42 | | } ccv_nnc_autograd_tensor_symbol_t; |
43 | | |
44 | | typedef struct { |
45 | | int d; // The tensor symbol ref. |
46 | | int x; // The exec symbol ref. |
47 | | ccv_array_t* exec_registry; // Additional exec symbol refs, similar to x, only useful for aliasing. |
48 | | ccv_array_t* alias_registry; // int point to all the alias (if this is not an alias). The alias is the object in autograd_tensor_symbols, you need another level of indirection to get the actual forward level alias. |
49 | | } ccv_nnc_tensor_ref_t; |
50 | | |
51 | | typedef struct { |
52 | | int c; // The start non-accumulated version. |
53 | | ccv_array_t* ref_version; // tensor ref point to the reverse tensor symbol. |
54 | | } ccv_nnc_autograd_tensor_version_t; |
55 | | |
56 | | typedef struct { |
57 | | int d; |
58 | | int alias_ref; |
59 | | } ccv_nnc_sum_variable_t; |
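// A minimal sketch (hypothetical example_* names, not library code) of the bookkeeping the
// structs above encode: when a forward tensor is written or consumed by several ops, each op
// contributes one gradient version; ref_version collects them, and c marks the first version
// that has not yet been folded into a sum node.
typedef struct {
	int d[8]; // stand-in for ref_version: one gradient symbol per version
	int rnum; // number of versions recorded so far
	int c; // index of the first non-accumulated version
} example_gradient_versions_t;

// Once a sum node over versions [c, rnum) is emitted and its result gets the symbol summed,
// that result becomes the newest version and everything before it counts as consumed
// (mirroring the tail of _ccv_nnc_graph_sum_autograd_tensor_versions further down).
static void example_mark_summed(example_gradient_versions_t* const ver, const int summed)
{
	ver->d[ver->rnum++] = summed;
	ver->c = ver->rnum - 1;
}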
60 | | |
61 | | // This method tries to figure out if a set of aliases can cover the whole tensor dim. |
62 | | // This is not a precise implementation though. The requirement is to answer this question |
63 | | // within a given memory constraint; therefore, it only allows up to 65536 different tensor locations.
64 | | // If you have more than that, it will assume the aliases don't fully assign the tensor,
65 | | // and will return 0. |
66 | | |
67 | | // Return 1 if inserted successfully. |
68 | | static inline int _ccv_nnc_try_mix(int* const md, const int ins, const int c) |
69 | 43 | { |
70 | 43 | if (!c) |
71 | 25 | { |
72 | 25 | md[0] = ins; |
73 | 25 | return 1; |
74 | 25 | } |
75 | 18 | int ll = 0, uu = c - 1; |
76 | 18 | int mm; |
77 | 20 | do { |
78 | 20 | mm = ll + ((uu - ll) >> 1); |
79 | 20 | if (ins == md[mm]) |
80 | 16 | return 0; |
81 | 4 | else if (ins < md[mm]) |
82 | 2 | uu = mm - 1; |
83 | 2 | else if (ins > md[mm]) |
84 | 2 | ll = mm + 1; |
85 | 20 | } while (ll <= uu);
86 | 2 | if (ll < c) |
87 | 2 | memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll)); |
88 | 2 | md[ll] = ins; |
89 | 2 | return 1; |
90 | 18 | } |
91 | | |
92 | | static inline int _ccv_nnc_mix_idx(const int* const md, const int ins, const int c) |
93 | 30 | { |
94 | 30 | if (c <= 1) |
95 | 22 | return 0; |
96 | 8 | int ll = 0, uu = c - 1; |
97 | 8 | int mm; |
98 | 14 | do { |
99 | 14 | mm = ll + ((uu - ll) >> 1); |
100 | 14 | if (ins == md[mm]) |
101 | 8 | return mm; |
102 | 6 | else if (ins < md[mm]) |
103 | 0 | uu = mm - 1; |
104 | 6 | else if (ins > md[mm]) |
105 | 6 | ll = mm + 1; |
106 | 14 | } while (ll <= uu);
107 | 0 | assert(0 && "Shouldn't reach here"); |
108 | 0 | return -1; |
109 | 0 | } |
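// A small usage sketch (hypothetical, not library code) of the two helpers above:
// _ccv_nnc_try_mix keeps md a sorted, de-duplicated list of interior boundary offsets,
// and _ccv_nnc_mix_idx later answers which cell a given offset falls into.
static void example_sparse_coordinate_map(void)
{
	int md[4];
	int len = 0;
	len += _ccv_nnc_try_mix(md, 5, len); // md = {5}, len = 1
	len += _ccv_nnc_try_mix(md, 2, len); // md = {2, 5}, len = 2
	len += _ccv_nnc_try_mix(md, 5, len); // duplicate, len stays 2
	const int cell = _ccv_nnc_mix_idx(md, 5, len); // cell == 1
	(void)cell;
}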
110 | | |
111 | | static inline void _ccv_nnc_try_set_pix_0(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset) |
112 | 6 | { |
113 | 6 | const int s = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
114 | 6 | const int d = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
115 | 6 | assert(s >= 0 && d > s); |
116 | 6 | int i; |
117 | 12 | for (i = s; i < d; i++)
118 | | // Fill this pix. I can make this faster by loop through full ones (divided by 8), but too lazy. |
119 | 6 | cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f)); |
120 | 6 | } |
121 | | |
122 | | static inline void _ccv_nnc_try_set_pix_1(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset) |
123 | 16 | { |
124 | 16 | const int s0 = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
125 | 16 | const int d0 = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
126 | 16 | assert(s0 >= 0 && d0 > s0); |
127 | 16 | const int s1 = (ofs[1] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[1], ofs[1], cube_dim[1]) + 1;
128 | 16 | const int d1 = ((ofs[1] + dim[1] == tensor_dim[1]) ? cube_dim[1] : _ccv_nnc_mix_idx(scmd[1], ofs[1] + ccv_max(1, dim[1]), cube_dim[1])) + 1;
129 | 16 | assert(s1 >= 0 && d1 > s1); |
130 | 16 | int i, j; |
131 | 16 | const int step1 = cube_step[1]; |
132 | 16 | if (step1 == d0 - s0) |
133 | 12 | { |
134 | | // Faster one, we can simply loop through. |
135 | 26 | for (i = s1 * step1; i < d1 * step1; i++)
136 | 14 | cube[(offset + i) >> 5] |= (1u << ((offset + i) & 0x1f)); |
137 | 12 | } else { |
138 | 4 | offset += s1 * step1; |
139 | | // There are gaps, slow one. |
140 | 8 | for (i = s1; i < d1; i++, offset += step1)
141 | 8 | for (j = s0; j < d0; j++)
142 | 4 | cube[(offset + j) >> 5] |= (1u << ((offset + j) & 0x1f)); |
143 | 4 | } |
144 | 16 | } |
145 | | |
146 | | static inline void _ccv_nnc_try_set_pix(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint32_t* const cube, int offset, const int dim_idx) |
147 | 30 | { |
148 | 30 | switch (dim_idx) |
149 | 30 | { |
150 | 16 | case 1: |
151 | 16 | _ccv_nnc_try_set_pix_1(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset); |
152 | 16 | return; |
153 | 6 | case 0: |
154 | 6 | _ccv_nnc_try_set_pix_0(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset); |
155 | 6 | return; |
156 | 30 | } |
157 | 8 | int i; |
158 | 8 | const int s = (ofs[dim_idx] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx], cube_dim[dim_idx]) + 1;
159 | 8 | const int d = ((ofs[dim_idx] + dim[dim_idx] == tensor_dim[dim_idx]) ? cube_dim[dim_idx] : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx] + ccv_max(1, dim[dim_idx]), cube_dim[dim_idx])) + 1;
160 | 8 | assert(s >= 0 && d > s); |
161 | 16 | for (i = s; i < d; i++)
162 | 8 | _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset + i * cube_step[dim_idx], dim_idx - 1); |
163 | 8 | } |
164 | | |
165 | | static int _ccv_nnc_tensor_ref_fully_assigned_with_aliases(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info) |
166 | 2.11k | { |
167 | | // Only work with tensor_ref of aliases. |
168 | 2.11k | assert(tensor_ref->alias_registry); |
169 | 2.11k | const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
170 | 2.11k | assert(tensor_symbol_info[autograd->d].alias_ref == 0); |
171 | 2.11k | const int* tensor_dim = tensor_symbol_info[autograd->d].info.dim; |
172 | 2.11k | const int tensor_count = ccv_nnc_dimension_count(tensor_dim); |
173 | 2.11k | int i, j; |
174 | 2.15k | for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
175 | 2.12k | { |
176 | 2.12k | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i); |
177 | 2.12k | assert(d < autograd_tensor_symbols->rnum); |
178 | 2.12k | const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
179 | 2.12k | assert(tensor_symbol_info[autograd->d].alias_ref); |
180 | 2.12k | const int* stride = tensor_symbol_info[autograd->d].stride; |
181 | | // If this is just reshaped (i.e., the dimension is the same and the stride covers the whole tensor), it is fully assigned.
182 | 2.12k | if (ccv_nnc_is_tensor_stride_packed(stride, tensor_symbol_info[autograd->d].info.dim) && ccv_nnc_dimension_count(tensor_symbol_info[autograd->d].info.dim) == tensor_count)
183 | 2.09k | return 1; |
184 | | // Otherwise, if the stride doesn't match the original dim, it is not covered.
185 | 39 | if (!ccv_nnc_is_tensor_stride_packed(stride, tensor_dim)) |
186 | 0 | return 0; |
187 | 39 | } |
188 | | /* We need a solid cube (potentially hyper dimensional) to compute if there are overlaps. |
189 | | * To make this cube as small as possible, we need to map the actual tensor dimension |
190 | | * (therefore, we don't actually allocate the whole tensor to compute overlaps) to a smaller |
191 | | * cube given the ofs and dim size of its aliases. |
192 | | * |
193 | | * The following code generates the dimension mapping (using scratch space) with binary search + insertion,
194 | | * and then we fill the cube with a given tensor alias's dimensional information (ofs, dim). |
195 | | * Afterwards, we simply need to check if the cube is totally filled up to know if this tensor |
196 | | * is fully assigned with its aliases (if that is the case, we can skip zeroing for this tensor). |
197 | | * |
198 | | * There are several restrictions, though, to make this faster: 1). I cannot handle any cube whose side
199 | | * lengths combined exceed 1023 (scm only has 1024 ints of scratch space). 2). I cannot handle any cube
200 | | * whose total volume is larger than 2048 * 8 (I only allocate 2K on stack for this).
201 | | * */ |
202 | 26 | int scm[1024]; // Having 1024 int scratch space for mapping dimensions. (Or sparse coordinate mapping). |
203 | 26 | int cube_dim[CCV_NNC_MAX_DIM_ALLOC] = {}; // Mapping dimension size. |
204 | 26 | int cube_size = 1; |
205 | 26 | int* scmptr = scm; |
206 | 50 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
207 | 40 | { |
208 | 40 | int head = 0, tail = 0; // Note that we touched both the head and tail (otherwise this dimension is not fully covered). |
209 | 40 | int len = 0; |
210 | 105 | for (j = 0; j < tensor_ref->alias_registry->rnum; j++)
211 | 65 | { |
212 | 65 | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, j); |
213 | 65 | assert(d < autograd_tensor_symbols->rnum); |
214 | 65 | const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
215 | 65 | assert(tensor_symbol_info[autograd->d].alias_ref); |
216 | 65 | const int* ofs = tensor_symbol_info[autograd->d].ofs; |
217 | 65 | const int* dim = tensor_symbol_info[autograd->d].info.dim; |
218 | 65 | head = head || (ofs[i] == 0);
219 | 65 | tail = tail || (ofs[i] + ccv_max(1, dim[i]) == tensor_dim[i]);
220 | 65 | if (ofs[i] != 0) |
221 | 14 | len += _ccv_nnc_try_mix(scmptr, ofs[i], len); |
222 | 65 | if (scmptr - scm + len >= 1024) // Cannot handle that much, abort. |
223 | 0 | return 0; |
224 | 65 | if (ofs[i] + ccv_max(1, dim[i]) < tensor_dim[i]) |
225 | 29 | len += _ccv_nnc_try_mix(scmptr, ofs[i] + ccv_max(1, dim[i]), len); |
226 | 65 | if (scmptr - scm + len >= 1024) // Cannot handle that much, abort. |
227 | 0 | return 0; |
228 | 65 | } |
229 | 40 | if (!head || !tail)
230 | 16 | return 0; |
231 | 24 | cube_size *= (len + 1); |
232 | 24 | cube_dim[i] = len; |
233 | 24 | scmptr += len; // Moving to next level. |
234 | 24 | } |
235 | | // The cube map is too large, cannot do the computation, assume it is not fully assigned. |
236 | 10 | if (cube_size > 2048 * 8) |
237 | 0 | return 0; |
238 | | // binary map to see if it fills up. |
239 | 10 | uint32_t cube[(cube_size + 31) >> 5]; |
240 | 10 | memset(cube, 0, sizeof(uint32_t) * ((cube_size + 31) >> 5)); |
241 | 10 | int* scmd[CCV_NNC_MAX_DIM_ALLOC] = {}; // Sparse coordinate map at dimension x. |
242 | 10 | int cube_step[CCV_NNC_MAX_DIM_ALLOC] = {}; |
243 | 32 | for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
244 | 22 | { |
245 | 22 | cube_step[i] = (i > 0) ? cube_step[i - 1] * (cube_dim[i - 1] + 1) : 1;
246 | 22 | scmd[i] = (i > 0) ? scmd[i - 1] + cube_dim[i - 1] : scm;
247 | 22 | } |
248 | 10 | const int max_dim = i; |
249 | 32 | for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
250 | 22 | { |
251 | 22 | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i); |
252 | 22 | assert(d < autograd_tensor_symbols->rnum); |
253 | 22 | const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
254 | 22 | assert(tensor_symbol_info[autograd->d].alias_ref); |
255 | 22 | const int* ofs = tensor_symbol_info[autograd->d].ofs; |
256 | 22 | const int* dim = tensor_symbol_info[autograd->d].info.dim; |
257 | 22 | _ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, 0, max_dim - 1); |
258 | 22 | } |
259 | | // Compare to see now if the binary map filled up. If it filled up, we know it is fully assigned. |
260 | 10 | for (i = 0; i < (cube_size >> 5); i++)
261 | 0 | if (cube[i] < 0xffffffff) |
262 | 0 | return 0; |
263 | 10 | if ((cube_size & 0x1f) > 0) |
264 | 10 | { |
265 | | // Fetch the rest. |
266 | 10 | uint32_t r = 0; |
267 | 32 | for (i = 0; i < (cube_size & 0x1f); i++)
268 | 22 | r |= (1u << i); |
269 | 10 | assert(cube[((cube_size + 31) >> 5) - 1] <= r); |
270 | 10 | if (cube[((cube_size + 31) >> 5) - 1] < r) |
271 | 0 | return 0; |
272 | 10 | } |
273 | 10 | return 1; |
274 | 10 | } |
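// A hypothetical miniature (example_* is not library code) of the check above: two aliases
// split a single 6-element dimension at offset 2, so the mapped cube has only 2 cells; if
// both cells end up marked, the tensor is fully assigned and the zero-init flag can be skipped.
static int example_cube_fully_filled(void)
{
	const int cube_size = 2; // cells [0, 2) and [2, 6) after mapping the single offset 2
	uint32_t cube = 0;
	cube |= 1u << 0; // alias with ofs = 0, dim = 2 fills the first cell
	cube |= 1u << 1; // alias with ofs = 2, dim = 4 fills the second cell
	int i;
	for (i = 0; i < cube_size; i++)
		if (!(cube & (1u << i)))
			return 0; // a gap remains, the tensor would still need zero-init
	return 1; // fully covered
}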
275 | | |
276 | | static int _ccv_nnc_tensor_ref_version_find_init(const ccv_nnc_autograd_tensor_version_t* const tensor_ver) |
277 | 5 | { |
278 | 5 | int i; |
279 | 10 | for (i = 0; i < tensor_ver->ref_version->rnum; i++)
280 | 7 | if (((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->x < 0) |
281 | 2 | return i; |
282 | 3 | return -1; |
283 | 5 | } |
284 | | |
285 | | static void _ccv_nnc_graph_sum_autograd_tensor_versions(const int idx, const int d, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs) |
286 | 4.27k | { |
287 | 4.27k | int i, j; |
288 | 4.27k | assert(tensor_ver->c < tensor_ver->ref_version->rnum); |
289 | 4.27k | const int input_size = tensor_ver->ref_version->rnum - tensor_ver->c; |
290 | 4.27k | int* inputs = (int*)ccmalloc(sizeof(int) * input_size); |
291 | 12.8k | for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
292 | 8.57k | inputs[i] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->d; |
293 | 4.27k | const ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
294 | 4.27k | .d = d |
295 | 4.27k | }; |
296 | 4.27k | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
297 | 4.27k | ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = { |
298 | 4.27k | .input_size = input_size, |
299 | 4.27k | .inputs = inputs, |
300 | 4.27k | .output = autograd_tensor_symbols->rnum - 1 |
301 | 4.27k | }; |
302 | 4.27k | if (idx >= 0) |
303 | 4.24k | { |
304 | 4.24k | sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0); |
305 | 4.24k | ccv_array_push(sum_exec.outgoings, &idx); |
306 | 4.24k | } |
307 | 4.27k | ccv_array_push(sum_or_set_execs, &sum_exec); |
308 | 4.27k | const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1; |
309 | 12.8k | for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
310 | 8.57k | { |
311 | 8.57k | const ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i); |
312 | 8.57k | const int x = tensor_ref->x; |
313 | 8.57k | if (x < 0) /* This is initialization tensor, it has to be occurred before the execution anyway. */ |
314 | 1 | { |
315 | | // No alias. |
316 | 1 | assert(!tensor_ref->alias_registry); |
317 | | // No associated additional execs. |
318 | 1 | assert(!tensor_ref->exec_registry); |
319 | 1 | continue; |
320 | 1 | } |
321 | 8.57k | if (x < exec_symbol_info_size) |
322 | 8.57k | { |
323 | 8.57k | ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x; |
324 | 8.57k | if (!back_exec->outgoings) |
325 | 39 | back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0); |
326 | 8.57k | ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing); |
327 | 8.57k | } else { |
328 | | // This tensor_ref is generated by the sum operation. |
329 | 0 | ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size); |
330 | 0 | ccv_array_replace_unique_int(sum_or_set->outgoings, idx, outgoing); |
331 | 0 | } |
332 | | // If this tensor has an associated alias, we need to init it to zeros when it is allocated (we only need to set a flag here)
333 | | // it is handled at compilation phase. |
334 | 8.57k | if (tensor_ref->alias_registry && |
335 | | // Loop over to see if this tensor is fully occupied to avoid extra zero step. |
336 | 8.57k | !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
337 | 8 | { |
338 | 8 | ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
339 | | // By having alias_registry, what this symbol represents must not be an alias.
340 | 8 | assert(tensor_sym->alias_ref == 0); |
341 | 8 | tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
342 | 8 | } |
343 | 8.57k | if (tensor_ref->exec_registry) |
344 | 4 | for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
345 | 2 | { |
346 | 2 | const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j); |
347 | 2 | assert(x >= 0); |
348 | | // The exec_registry can only be generated by alias registry, therefore, it cannot reference to a sum operation. |
349 | 2 | assert(x < exec_symbol_info_size); |
350 | 2 | ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x; |
351 | 2 | if (!back_exec->outgoings) |
352 | 1 | back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0); |
353 | 2 | ccv_array_replace_unique_int(back_exec->outgoings, idx, outgoing); |
354 | 2 | } |
355 | 8.57k | } |
356 | 4.27k | const ccv_nnc_tensor_ref_t tensor_ref = { |
357 | 4.27k | .d = autograd_tensor_symbols->rnum - 1, |
358 | 4.27k | .x = outgoing |
359 | 4.27k | }; |
360 | 4.27k | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
361 | | /* Move the c pointer up to the latest summed result. */ |
362 | 4.27k | tensor_ver->c = tensor_ver->ref_version->rnum - 1; |
363 | 4.27k | } |
364 | | |
365 | | static int _ccv_nnc_tensor_ref_version_involve_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias) |
366 | 69 | { |
367 | 69 | assert(alias->alias_ref > 0); |
368 | | // No alias_registry, must conflict (owns the whole band). |
369 | 69 | if (!tensor_ref->alias_registry) |
370 | 25 | return 1; |
371 | 44 | int i; |
372 | 63 | for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
373 | 54 | { |
374 | 54 | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i); |
375 | 54 | assert(d < autograd_tensor_symbols->rnum); |
376 | 54 | ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
377 | 54 | if (ccv_nnc_over_tensor_symbol_aliases(tensor_symbol_info + autograd->d, alias)) |
378 | 35 | return 1; |
379 | 54 | } |
380 | | // All aliases referenced by this ref_version doesn't overlap with the provided one, thus, there is no conflict at all. |
381 | 9 | return 0; |
382 | 44 | } |
383 | | |
384 | | static int _ccv_nnc_tensor_ref_version_find_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias) |
385 | 30 | { |
386 | 30 | assert(alias->alias_ref > 0); |
387 | | // No alias_registry, thus, cannot find the exact matched alias. |
388 | 30 | if (!tensor_ref->alias_registry) |
389 | 11 | return -1; |
390 | 19 | int i; |
391 | 34 | for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
392 | 26 | { |
393 | 26 | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i); |
394 | 26 | assert(d < autograd_tensor_symbols->rnum); |
395 | 26 | ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
396 | | // This must reference to an alias. |
397 | 26 | assert(tensor_symbol_info[autograd->d].alias_ref); |
398 | 26 | const int* stride = tensor_symbol_info[autograd->d].stride; |
399 | 26 | const int* ofs = tensor_symbol_info[autograd->d].ofs; |
400 | 26 | const int* dim = tensor_symbol_info[autograd->d].info.dim; |
401 | | // If everything matches, this is the required alias. |
402 | 26 | if (memcmp(stride, alias->stride, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 && |
403 | 26 | memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 && |
404 | 26 | memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
405 | 11 | return d; |
406 | 26 | } |
407 | 8 | return -1; |
408 | 19 | } |
409 | | |
410 | | static int _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias) |
411 | 4 | { |
412 | 4 | assert(alias->alias_ref > 0); |
413 | | // No alias_registry, thus, cannot find the exact matched alias. |
414 | 4 | if (!tensor_ref->alias_registry) |
415 | 0 | return 0; |
416 | 4 | int i; |
417 | 8 | for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
418 | 5 | { |
419 | 5 | const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i); |
420 | 5 | assert(d < autograd_tensor_symbols->rnum); |
421 | 5 | ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d); |
422 | | // This must reference to an alias. |
423 | 5 | assert(tensor_symbol_info[autograd->d].alias_ref); |
424 | 5 | const int* stride = tensor_symbol_info[autograd->d].stride; |
425 | 5 | const int* ofs = tensor_symbol_info[autograd->d].ofs; |
426 | 5 | const int* dim = tensor_symbol_info[autograd->d].info.dim; |
427 | 5 | if (memcmp(stride, alias->stride, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 || |
428 | 5 | memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 || |
429 | 5 | memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
430 | 1 | return 0; |
431 | 5 | } |
432 | | // If everything matches for every alias in registry, we can use any of the alias directly. |
433 | 3 | return 1; |
434 | 4 | } |
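// The three helpers above boil down to comparing alias views: identical stride, ofs and dim means
// the same view (find_alias / has_this_alias_exclusively), while any intersection of the covered
// regions means a conflict (involve_alias). Below is a hypothetical, simplified per-axis interval
// test (not library code; the real check lives in ccv_nnc_over_tensor_symbol_aliases and also
// accounts for strides).
static int example_views_overlap(const int* const ofs_a, const int* const dim_a, const int* const ofs_b, const int* const dim_b, const int nd)
{
	int i;
	for (i = 0; i < nd; i++)
		if (ofs_a[i] + dim_a[i] <= ofs_b[i] || ofs_b[i] + dim_b[i] <= ofs_a[i])
			return 0; // separated along this axis, no overlap possible
	return 1; // the intervals intersect along every axis
}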
435 | | |
436 | | static int _ccv_nnc_graph_sum_autograd_tensor_versions_alias(const int idx, const int d, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const alias, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_or_set_execs) |
437 | 21 | { |
438 | 21 | assert(tensor_ver->c < tensor_ver->ref_version->rnum); |
439 | 21 | int i, j = 0; |
440 | 21 | struct { |
441 | 21 | int k; |
442 | 21 | int i; |
443 | 21 | } kd[tensor_ver->ref_version->rnum - tensor_ver->c]; |
444 | 51 | for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
445 | 30 | { |
446 | 30 | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i); |
447 | 30 | const int k = _ccv_nnc_tensor_ref_version_find_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias); |
448 | 30 | if (k >= 0) |
449 | 11 | kd[j++] = (typeof(kd[0])){ |
450 | 11 | .k = k, .i = i |
451 | 11 | }; |
452 | 19 | else if (_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias)) |
453 | 19 | kd[j++] = (typeof(kd[0])) { |
454 | 19 | .k = -1, .i = i // It has dependency to the original tensor (non-alias) now, label this with highest bit. |
455 | 19 | }; |
456 | 30 | } |
457 | | // Can only find one. This is the easy case, we can simply return that symbol (or its alias). |
458 | 21 | if (j == 1) |
459 | 15 | { |
460 | 15 | if (kd[0].k >= 0) |
461 | 4 | return kd[0].k; // Only can find one alias, that is the one. |
462 | | // Otherwise, need to create a new alias. |
463 | 11 | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[0].i); |
464 | 11 | ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
465 | | // Since we create new alias, we need to set the referenced one to be allocated with 0s. |
466 | 11 | if (ref->alias_ref) // If this is an alias, it has to be zero initialized. |
467 | 0 | { |
468 | 0 | ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, ref->alias_ref - 1); |
469 | 0 | assert(ref->alias_ref == 0); // This is original. |
470 | 0 | ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
471 | 11 | } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied. |
472 | | // Loop over to see if this tensor is fully occupied to avoid extra zero step. |
473 | 11 | !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
474 | 1 | ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
475 | 1 | } |
476 | 11 | ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
477 | 11 | .d = d, |
478 | 11 | .alias_ref = tensor_ref->d + 1 |
479 | 11 | }; |
480 | 11 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
481 | 11 | const int ad = autograd_tensor_symbols->rnum - 1; |
482 | 11 | if (tensor_ref->alias_registry) // Only push this when it has an alias registry (otherwise it already conflict with everyone). |
483 | 3 | ccv_array_push(tensor_ref->alias_registry, &ad); |
484 | 11 | if (tensor_ref->x >= exec_symbol_info_size && idx >= 0)
485 | 2 | { |
486 | 2 | ccv_nnc_sum_or_set_graph_exec_symbol_t* const sum_or_set_exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, tensor_ref->x - exec_symbol_info_size); |
487 | | // This may be summed, thus, we need to create a connection between this and the sum. |
488 | 2 | if (!sum_or_set_exec->outgoings) |
489 | 0 | sum_or_set_exec->outgoings = ccv_array_new(sizeof(int), 1, 0); |
490 | 2 | ccv_array_push(sum_or_set_exec->outgoings, &idx); |
491 | 2 | } |
492 | | // The newly inserted tensor symbol. |
493 | 11 | return ad; |
494 | 11 | } |
495 | | // Otherwise, we need to create the sum operation out of these. |
496 | 6 | const int input_size = j; |
497 | 6 | int has_this_alias_exclusively = 1; |
498 | 6 | int* inputs = input_size > 0 ? (int*)ccmalloc(sizeof(int) * input_size) : 0;
499 | 21 | for (i = 0; i < input_size; i++)
500 | 15 | { |
501 | 15 | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i); |
502 | | // Can take a fast path if every ref involved has the same alias, our sum operation can be faster (using alias directly). |
503 | 15 | if (has_this_alias_exclusively && kd[i].k >= 0 && _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
504 | 3 | inputs[i] = *(int*)ccv_array_get(tensor_ref->alias_registry, 0); // Assigning the alias. |
505 | 12 | else { |
506 | 12 | if (has_this_alias_exclusively) |
507 | 5 | { |
508 | 5 | has_this_alias_exclusively = 0; |
509 | 5 | for (j = 0; j < i; j++)
510 | 0 | inputs[j] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[j].i))->d; |
511 | 5 | } |
512 | 12 | inputs[i] = tensor_ref->d; |
513 | 12 | } |
514 | 15 | } |
515 | 6 | ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
516 | 6 | .d = alias->alias_ref - 1 |
517 | 6 | }; |
518 | 6 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
519 | 6 | const int tensor_ref_d = autograd_tensor_symbols->rnum - 1; |
520 | 6 | tensor_sym.d = d; |
521 | 6 | tensor_sym.alias_ref = tensor_ref_d + 1; |
522 | 6 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
523 | 6 | const int ad = autograd_tensor_symbols->rnum - 1; |
524 | 6 | ccv_nnc_sum_or_set_graph_exec_symbol_t sum_exec = { |
525 | 6 | .input_size = input_size, |
526 | 6 | .inputs = inputs, |
527 | 6 | .output = has_this_alias_exclusively ? ad : tensor_ref_d /* If has this alias exclusively, the output should be alias as well. Otherwise the output is the real tensor. */
528 | 6 | }; |
529 | 6 | if (idx >= 0) |
530 | 6 | { |
531 | 6 | sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0); |
532 | 6 | ccv_array_push(sum_exec.outgoings, &idx); |
533 | 6 | } |
534 | 6 | ccv_array_push(sum_or_set_execs, &sum_exec); |
535 | 6 | const int outgoing = exec_symbol_info_size + sum_or_set_execs->rnum - 1; |
536 | 6 | int no_alias_registry = 0; |
537 | 21 | for (i = 0; i < input_size; i++)
538 | 15 | { |
539 | 15 | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i); |
540 | 15 | if (!has_this_alias_exclusively) |
541 | 12 | { |
542 | | // If the sum operation is not operating on one alias, I need to zero this tensor out when it is first
543 | | // allocated (see discussions around the flags I use). |
544 | 12 | ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
545 | 12 | if (tensor_sym->alias_ref) |
546 | 0 | { |
547 | | // Find the original tensor_sym and set its flags (I prefer to set flags on its original). |
548 | 0 | ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_sym->alias_ref - 1); |
549 | 0 | assert(ref->alias_ref == 0); // This is original. |
550 | 0 | ref->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
551 | 12 | } else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied. |
552 | | // Loop over to see if this tensor is fully occupied to avoid extra zero step. |
553 | 12 | !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
554 | 6 | tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
555 | 6 | } |
556 | 12 | } |
557 | | // Check to see if any of these tensors doesn't have alias. |
558 | 15 | no_alias_registry |= (!tensor_ref->alias_registry); |
559 | 15 | const int x = tensor_ref->x; |
560 | 15 | assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */ |
561 | 15 | if (x < exec_symbol_info_size) |
562 | 15 | { |
563 | 15 | ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x; |
564 | 15 | if (!back_exec->outgoings) |
565 | 0 | back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0); |
566 | 15 | ccv_array_push(back_exec->outgoings, &outgoing); |
567 | 15 | } else { |
568 | 0 | ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size); |
569 | 0 | ccv_array_push(sum_or_set->outgoings, &outgoing); |
570 | 0 | } |
571 | 15 | if (tensor_ref->exec_registry) |
572 | 6 | for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
573 | 3 | { |
574 | 3 | const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j); |
575 | 3 | assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */ |
576 | 3 | assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation. |
577 | 3 | ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x; |
578 | 3 | if (!back_exec->outgoings) |
579 | 0 | back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0); |
580 | 3 | ccv_array_push(back_exec->outgoings, &outgoing); |
581 | 3 | } |
582 | 15 | } |
583 | 6 | const ccv_nnc_tensor_ref_t tensor_ref = { |
584 | 6 | .d = tensor_ref_d, |
585 | 6 | .x = outgoing, |
586 | 6 | .exec_registry = 0, // I don't need to take execution dependencies because this tensor is generated by sum, therefore, we already take that dependency. |
587 | 6 | .alias_registry = !no_alias_registry || has_this_alias_exclusively ? ccv_array_new(sizeof(int), 1, 0) : 0
588 | 6 | }; |
589 | | // If there is no alias registry, then we take the whole tensor ref as one. |
590 | 6 | if (!no_alias_registry || has_this_alias_exclusively)
591 | 4 | { |
592 | | // If this tensor ref contains multiple different types of alias, have to add them together (otherwise |
593 | | // the computation for if there is an empty slot in this tensor ref is not correct without all the |
594 | | // occupancy availability information). |
595 | 4 | if (!has_this_alias_exclusively) |
596 | 10 | for (i = 0; i < input_size; i++)
597 | 7 | { |
598 | 7 | ccv_nnc_tensor_ref_t* ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i); |
599 | 7 | assert(ref->alias_registry); |
600 | | // It may get duplicates. But whatever, won't matter the computation. |
601 | 19 | for (j = 0; j < ref->alias_registry->rnum; j++)
602 | 12 | ccv_array_push(tensor_ref.alias_registry, ccv_array_get(ref->alias_registry, j)); |
603 | 7 | } |
604 | 4 | ccv_array_push(tensor_ref.alias_registry, &ad); |
605 | 4 | } |
606 | 6 | assert(input_size <= tensor_ver->ref_version->rnum - tensor_ver->c); |
607 | 6 | ccv_nnc_tensor_ref_t x; |
608 | 21 | for (i = 0; i < input_size; i++)
609 | | // If the current one (i + tensor_ver->c) is smaller than the one referenced to, exchange. |
610 | 15 | if (kd[i].i > i + tensor_ver->c) |
611 | 0 | CCV_SWAP(*(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i + tensor_ver->c), *(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i), x); |
612 | 6 | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
613 | | // We've consumed input_size tensor refs, now move c up to the pointer of non-consumed tensors. |
614 | 6 | tensor_ver->c += input_size; |
615 | 6 | return ad; |
616 | 6 | } |
617 | | |
618 | | typedef struct ccv_nnc_symbolic_graph_backward_prep_s { |
619 | | int exec_symbol_info_size; // Number of graph exec symbols before adding any new symbols related to automatic differentiation. |
620 | | int tensor_symbol_info_size; // Number of tensor symbols before adding anything new. |
621 | | int sub_prep_size; |
622 | | ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info; |
623 | | ccv_nnc_tensor_symbol_info_t* tensor_symbol_info; |
624 | | ccv_nnc_graph_backward_info_t* backward_info; // Corresponding to forward graph exec symbol info, it is exactly in reverse. |
625 | | ccv_nnc_graph_visit_t* forward_visit; // The visitor structure (top sorted index) when doing traversal. |
626 | | ccv_nnc_graph_visit_t* backward_visit; // The visitor structure (top sorted index) when doing reverse traversal. |
627 | | ccv_nnc_autograd_graph_exec_symbol_t* autograd_execs; // The graph exec symbols we need for automatic differentiation. This is a 1:1 mapping for forward graph exec symbols, however, unlike backward_info, its outgoings may be more complex (may contain outgoing flows to sum nodes). |
628 | | ccv_nnc_autograd_tensor_version_t* autograd_tensor_versions; // Corresponding to forward tensor symbols, each may contain multiple versions (due to multi-write). |
629 | | ccv_array_t* autograd_tensor_symbols; // The tensor symbols we need for automatic differentiation (it may not be 1:1 mapping). |
630 | | ccv_array_t* sum_or_set_execs; // The sum nodes, because in reverse mode, a tensor could have multiple versions, we need to sum them up before use. |
631 | | struct ccv_nnc_symbolic_graph_backward_prep_s* sub_preps; // The preps of its sub-graphs. |
632 | | // Pointers not managed by this struct |
633 | | ccv_nnc_symbolic_graph_t* graph; |
634 | | } ccv_nnc_symbolic_graph_backward_prep_t; |
635 | | |
636 | | static ccv_nnc_symbolic_graph_backward_prep_t _ccv_nnc_symbolic_graph_backward_prep(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size) |
637 | 6.78k | { |
638 | 6.78k | const int exec_symbol_info_size = graph->exec_symbol_info->rnum; |
639 | 6.78k | assert(exec_symbol_info_size > 0); |
640 | 6.78k | const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum; |
641 | 6.78k | assert(tensor_symbol_info_size > 0); |
642 | 13.5k | ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size, 0);
643 | 6.78k | ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * tensor_symbol_info_size); |
644 | 13.5k | ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new6.78k (graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size, 0); |
645 | 0 | ccv_nnc_symbolic_graph_symbol_infer(graph, forward_visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info); |
646 | 13.5k | int i; |
647 | | // Now, for each one of these, find a reverse graph. |
648 | 13.5k | ccv_nnc_graph_backward_info_t* backward_info = (ccv_nnc_graph_backward_info_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_graph_backward_info_t));
649 | 19.1k | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) { |
650 | 19.1k | assert(ccv_nnc_cmd_is_forward(node->cmd) || node->cmd.cmd == CCV_NNC_NOOP); |
651 | 19.1k | if (node->outgoings) |
652 | 24.7k | for (i = 0; i < node->outgoings->rnum; i++)
653 | 12.4k | { |
654 | 12.4k | int d = *(int*)ccv_array_get(node->outgoings, i); |
655 | 12.4k | if (!backward_info[d].outgoings) |
656 | 12.3k | backward_info[d].outgoings = ccv_array_new(sizeof(int32_t), 1, 0); |
657 | 12.4k | ccv_array_push(backward_info[d].outgoings, &idx); |
658 | 12.4k | } |
659 | 19.1k | } ccv_nnc_graph_visit_endfor |
660 | | // Also mark only the output bits that we use. |
661 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++)
662 | 19.1k | { |
663 | 19.1k | backward_info[i].input_bitmask_size = ((exec_symbol_info[i].output_size * 2 + exec_symbol_info[i].input_size + 63) >> 6); |
664 | 19.1k | backward_info[i].output_bitmask_size = ((exec_symbol_info[i].input_size + 63) >> 6); |
665 | | // Allocate input / output bitmasks |
666 | 19.1k | if (backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size > 0) |
667 | 19.1k | { |
668 | 19.1k | backward_info[i].input_bitmasks = (uint64_t*)cccalloc(backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size, sizeof(uint64_t)); |
669 | 19.1k | if (backward_info[i].output_bitmask_size) |
670 | 19.1k | backward_info[i].output_bitmasks = backward_info[i].input_bitmasks + backward_info[i].input_bitmask_size; |
671 | 19.1k | } |
672 | 19.1k | } |
673 | 6.78k | ccv_nnc_graph_visit_t* backward_visit = ccv_nnc_graph_visit_new(graph, backward_info, exec_symbol_info_size, destinations, destination_size, sources, source_size, 0); |
674 | 6.78k | const int sub_prep_size = graph->sub_graphs ? graph->sub_graphs->rnum : 0;
675 | 6.78k | ccv_nnc_symbolic_graph_backward_prep_t* sub_preps = sub_prep_size > 0 ? (ccv_nnc_symbolic_graph_backward_prep_t*)cccalloc(sub_prep_size, sizeof(ccv_nnc_symbolic_graph_backward_prep_t)) : 0;
676 | 6.78k | for (i = 0; i < sub_prep_size; i++)
677 | 4 | { |
678 | 4 | const ccv_nnc_symbolic_graph_t* const sub_graph = *(ccv_nnc_symbolic_graph_t**)ccv_array_get(graph->sub_graphs, i); |
679 | 4 | sub_preps[i] = _ccv_nnc_symbolic_graph_backward_prep(sub_graph, ccv_nnc_symbolic_graph_sources(sub_graph), ccv_nnc_symbolic_graph_source_size(sub_graph), ccv_nnc_symbolic_graph_destinations(sub_graph), ccv_nnc_symbolic_graph_destination_size(sub_graph)); |
680 | 4 | } |
681 | 6.78k | return (ccv_nnc_symbolic_graph_backward_prep_t){ |
682 | 6.78k | .exec_symbol_info_size = exec_symbol_info_size, |
683 | 6.78k | .tensor_symbol_info_size = tensor_symbol_info_size, |
684 | 6.78k | .sub_prep_size = sub_prep_size, |
685 | 6.78k | .exec_symbol_info = exec_symbol_info, |
686 | 6.78k | .tensor_symbol_info = tensor_symbol_info, |
687 | 6.78k | .backward_info = backward_info, |
688 | 6.78k | .forward_visit = forward_visit, |
689 | 6.78k | .backward_visit = backward_visit, |
690 | 6.78k | .sub_preps = sub_preps, |
691 | 6.78k | .graph = (ccv_nnc_symbolic_graph_t*)graph, |
692 | 6.78k | }; |
693 | 6.78k | } |
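// Hypothetical helpers (example_* is not library code) spelling out the bit layout behind the
// bitmasks allocated in the function above and consumed during pruning below. For a backward node
// derived from a forward node with input_size inputs and output_size outputs, the backward inputs
// are laid out as [gradients of forward outputs][forward inputs][forward outputs] (hence
// output_size * 2 + input_size bits), and the backward outputs are [gradients of forward inputs]
// (input_size bits).
static inline int example_grad_of_output_bit(const int i) { return i; }
static inline int example_forward_input_bit(const int output_size, const int i) { return output_size + i; }
static inline int example_forward_output_bit(const int output_size, const int input_size, const int i) { return output_size + input_size + i; }
static inline int example_grad_of_input_bit(const int i) { return i; } // index into the output bitmask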
694 | | |
695 | | static void _ccv_nnc_symbolic_graph_backward_exec_io(const ccv_nnc_graph_exec_symbol_info_t* const node, int** const back_input_map, int** const back_output_map, int* const back_input_size, int* const back_output_size) |
696 | 19.1k | { |
697 | 19.1k | int i; |
698 | 19.1k | if (node->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) |
699 | 7 | { |
700 | 7 | *back_input_map = node->outputs; |
701 | 7 | *back_input_size = node->output_size; |
702 | 14 | for (i = 0; i < node->case_of.argument.offset; i++)
703 | 7 | (*back_output_map)[i] = node->inputs[i]; |
704 | 7 | const int argument_offset = node->case_of.argument.offset; |
705 | 7 | const int argument_size = node->case_of.argument.size; |
706 | | // Skip the argument range. |
707 | 7 | for (i = argument_offset + argument_size; i < node->input_size; i++)
708 | 0 | (*back_output_map)[i - argument_size] = node->inputs[i]; |
709 | 7 | *back_output_size = node->input_size - node->case_of.argument.size; |
710 | 19.1k | } else { // if (node->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) { |
711 | 19.1k | *back_input_map = node->outputs; |
712 | 19.1k | *back_input_size = node->output_size; |
713 | 19.1k | *back_output_map = node->inputs; |
714 | 19.1k | *back_output_size = node->input_size; |
715 | 19.1k | } |
716 | 19.1k | } |
717 | | |
718 | | static void _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(const ccv_nnc_graph_exec_symbol_info_t* const forw_exec, const ccv_nnc_symbolic_graph_t* const sub_graph, const int graph_ref, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const uint64_t* const input_bitmasks, const uint64_t* const output_bitmasks, ccv_array_t* const sub_f_symbols, ccv_array_t* const sub_wrt_symbols) |
719 | 8 | { |
720 | 8 | int i, j; |
721 | 8 | ccv_array_clear(sub_wrt_symbols); |
722 | 8 | int forw_outputs[ccv_max(1, forw_exec->output_size)]; |
723 | 8 | int forw_inputs[ccv_max(1, forw_exec->input_size)]; |
724 | 8 | int* back_input_map = forw_outputs; |
725 | 8 | int* back_output_map = forw_inputs; |
726 | 8 | int back_input_size, back_output_size; |
727 | 8 | _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_input_size, &back_output_size); |
728 | 18 | for (i = 0; i < back_output_size; i++)
729 | 10 | if (output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
730 | 8 | { |
731 | 8 | const int d = back_output_map[i]; |
732 | 8 | const ccv_array_t* const s_refs = tensor_symbol_info[d].s_ref; |
733 | 8 | const int s_ref = s_refs && s_refs->rnum > graph_ref ? *(int*)ccv_array_get(s_refs, graph_ref) - 1 : -1;
734 | 8 | if (s_ref >= 0) |
735 | 4 | { |
736 | 4 | ccv_nnc_tensor_symbol_t sub_wrt_symbol = { |
737 | 4 | .d = s_ref, |
738 | 4 | .graph = sub_graph, |
739 | 4 | }; |
740 | 4 | ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol); |
741 | 4 | } else |
742 | 4 | ccv_array_push(sub_wrt_symbols, &NO_TENSOR_SYMBOL); |
743 | 8 | } |
744 | 8 | ccv_array_clear(sub_f_symbols); |
745 | 16 | for (i = 0; i < back_input_size; i++)
746 | 8 | if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
747 | 8 | { |
748 | 8 | const int d = back_input_map[i]; |
749 | 8 | ccv_nnc_tensor_symbol_t sub_f_symbol = { |
750 | 8 | .d = *(int*)ccv_array_get(tensor_symbol_info[d].s_ref, graph_ref) - 1, |
751 | 8 | .graph = sub_graph, |
752 | 8 | }; |
753 | 8 | ccv_array_push(sub_f_symbols, &sub_f_symbol); |
754 | 8 | } |
755 | | // Go through all its assignments (parameterized loop), making them either wrt or f. |
756 | | // The reason is these must flow through the graph, otherwise we cannot form a full |
757 | | // enclosed loop. Also because they are the additional f / wrt symbols, there is |
758 | | // no case that we cannot find their corresponding gradients in the backward sub graphs |
759 | | // (these gradients have to be parameterized to form an enclosed loop as well). |
760 | 30 | for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++)
761 | 22 | { |
762 | 22 | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i); |
763 | 22 | if (tensor_symbol_info->assign_ref) |
764 | 2 | { |
765 | 2 | const int assign_ref = tensor_symbol_info->assign_ref - 1; |
766 | | // i is the wrt, assign_ref is the f. |
767 | 2 | int flag = 0; |
768 | 4 | for (j = 0; !flag && j < sub_wrt_symbols->rnum; j++)
769 | 2 | flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, j))->d == i); |
770 | 2 | if (!flag) |
771 | 2 | { |
772 | 2 | ccv_nnc_tensor_symbol_t sub_wrt_symbol = { |
773 | 2 | .d = i, |
774 | 2 | .graph = sub_graph, |
775 | 2 | }; |
776 | 2 | ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol); |
777 | 2 | } |
778 | 2 | flag = 0; |
779 | 4 | for (j = 0; !flag && j < sub_f_symbols->rnum; j++)
780 | 2 | flag = (((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, j))->d == assign_ref); |
781 | 2 | if (!flag) |
782 | 0 | { |
783 | 0 | ccv_nnc_tensor_symbol_t sub_f_symbol = { |
784 | 0 | .d = assign_ref, |
785 | 0 | .graph = sub_graph, |
786 | 0 | }; |
787 | 0 | ccv_array_push(sub_f_symbols, &sub_f_symbol); |
788 | 0 | } |
789 | 2 | } |
790 | 22 | } |
791 | 8 | } |
792 | | |
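// A hypothetical miniature (not library code) of the elimination loop inside
// _ccv_nnc_symbolic_graph_backward_prep_prune_ops below: tentatively clear one candidate bit at a
// time, ask the command's bitmask check whether the remaining tensors still satisfy it, keep the
// drop if so, otherwise refit the bit, and repeat until nothing changes.
static uint64_t example_prune_bits(uint64_t bits, const int bit_count, int (*still_valid)(uint64_t))
{
	int i, flag;
	do {
		flag = 0;
		for (i = 0; i < bit_count; i++)
			if (bits & ((uint64_t)1 << i))
			{
				bits &= ~((uint64_t)1 << i); // try dropping bit i
				if (still_valid(bits))
					flag = 1; // the drop sticks, something changed this round
				else
					bits |= ((uint64_t)1 << i); // refit the bit
			}
	} while (flag);
	return bits;
}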
793 | | // Check whether for a given f_symbol, we can compute wrt_symbols at all, if we can, tag the minimal io and ops (some ops can be replaced with noop) required to do so. |
794 | | static int _ccv_nnc_symbolic_graph_backward_prep_prune_ops(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size) |
795 | 6.78k | { |
796 | 6.78k | int i, j, p; |
797 | 6.78k | const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size; |
798 | 6.78k | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info; |
799 | 6.78k | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info;
800 | 6.78k | const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit; |
801 | | // Now, for each one of these, find a reverse graph. |
802 | 6.78k | ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info; |
803 | 6.78k | const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit; |
804 | | // Find the f_symbols, and tag its flows. |
805 | 19.1k | ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) { |
806 | 19.1k | int f = node->f_wrt & 0x1; |
807 | 26.2k | for (i = 0; i < exec_symbol_info[idx].output_size && !f; i++)
808 | 7.04k | { |
809 | 7.04k | int d = exec_symbol_info[idx].outputs[i]; |
810 | 7.04k | if (d < 0) |
811 | 206 | continue; |
812 | 6.84k | while (tensor_symbol_info[d].alias_ref)
813 | 3 | d = tensor_symbol_info[d].alias_ref - 1; |
814 | 13.6k | for (j = 0; j < f_symbol_size && !f; j++)
815 | 6.85k | if (d == f_symbols[j].d) |
816 | 6.79k | f = 1; |
817 | 6.83k | } |
818 | 19.1k | if (f) |
819 | 19.1k | { |
820 | 19.1k | node->f_wrt |= f; |
821 | 19.1k | if (node->outgoings) |
822 | 24.6k | for (i = 0; i < node->outgoings->rnum; i++)
823 | 12.3k | { |
824 | 12.3k | int d = *(int*)ccv_array_get(node->outgoings, i); |
825 | 12.3k | backward_info[d].f_wrt |= f; |
826 | 12.3k | } |
827 | 19.1k | } |
828 | 19.1k | } ccv_nnc_graph_visit_endfor |
829 | | // Find the wrt_symbols, and tag its flows. |
830 | 19.1k | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) { |
831 | 19.1k | int wrt = backward_info[idx].f_wrt & 0x2; |
832 | 30.1k | for (i = 0; i < node->input_size && !wrt; i++)
833 | 10.9k | { |
834 | 10.9k | int d = node->inputs[i]; |
835 | 10.9k | if (d < 0) |
836 | 1 | continue; |
837 | 10.9k | while (tensor_symbol_info[d].alias_ref)
838 | 7 | d = tensor_symbol_info[d].alias_ref - 1; |
839 | 24.6k | for (j = 0; j < wrt_symbol_size && !wrt; j++)
840 | 13.7k | { |
841 | 13.7k | int wrt_d = wrt_symbols[j].d; |
842 | 13.7k | if (wrt_d < 0) |
843 | 29 | continue; |
844 | | // Find the root of this tensor alias. |
845 | 13.7k | if (tensor_symbol_info[wrt_d].alias_ref) |
846 | 2 | wrt_d = tensor_symbol_info[wrt_d].alias_ref - 1; |
847 | 13.7k | if (d == wrt_d) |
848 | 6.85k | wrt = 0x2; |
849 | 13.7k | } |
850 | 10.9k | } |
851 | 19.1k | if (wrt) |
852 | 19.1k | { |
853 | 19.1k | backward_info[idx].f_wrt |= wrt; |
854 | 19.1k | if (node->outgoings) |
855 | 24.7k | for (i = 0; i < node->outgoings->rnum; i++)
856 | 12.3k | { |
857 | 12.3k | int d = *(int*)ccv_array_get(node->outgoings, i); |
858 | 12.3k | backward_info[d].f_wrt |= wrt; |
859 | 12.3k | } |
860 | 19.1k | } |
861 | 19.1k | } ccv_nnc_graph_visit_endfor |
862 | 6.78k | enum { |
863 | 6.78k | WRT_SYMBOL_USE = 1, |
864 | 6.78k | F_SYMBOL_USE = 2 |
865 | 6.78k | }; |
866 | 6.78k | uint8_t* used_grad = (uint8_t*)cccalloc(tensor_symbol_info_size, sizeof(uint8_t)); |
867 | | // First, all f_symbols and wrt_symbols are used. |
868 | 13.5k | for (i = 0; i < f_symbol_size; i++)
869 | 6.79k | if (f_symbols[i].d >= 0)
870 | 6.79k | used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? tensor_symbol_info[f_symbols[i].d].alias_ref - 1 : f_symbols[i].d] |= F_SYMBOL_USE;
871 | 16.3k | for (i = 0; i < wrt_symbol_size; i++)
872 | 9.53k | if (wrt_symbols[i].d >= 0)
873 | 9.53k | used_grad[tensor_symbol_info[wrt_symbols[i].d].alias_ref ? tensor_symbol_info[wrt_symbols[i].d].alias_ref - 1 : wrt_symbols[i].d] |= WRT_SYMBOL_USE;
874 | | // Do optimistic assumption, and then compute used_grad |
875 | 19.1k | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) { |
876 | 19.1k | ccv_nnc_graph_backward_info_t* node = backward_info + idx; |
877 | | /* Only interested in the ones on the f / wrt flow */ |
878 | 19.1k | if ((node->f_wrt & 0x3) == 0x3) |
879 | 19.1k | { |
880 | 19.1k | const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx; |
881 | 19.1k | ccv_nnc_cmd_t cmd = forw_exec->cmd; |
882 | 19.1k | if (cmd.cmd != CCV_NNC_NOOP) |
883 | 19.1k | cmd.cmd += 1; /* Backward command is the one after forward command. */ |
884 | 19.1k | assert(ccv_nnc_cmd_is_backward(cmd) || cmd.cmd == CCV_NNC_NOOP); |
885 | 92.9k | for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; i++)
886 | 73.8k | if (!(i >= forw_exec->output_size && i < forw_exec->output_size + forw_exec->input_size &&
887 | 73.8k | forw_exec->inputs[i - forw_exec->output_size] < 0) && // If the input is empty, no need.
888 | 73.8k | !(i >= forw_exec->output_size + forw_exec->input_size && i < forw_exec->output_size * 2 + forw_exec->input_size &&
889 | 73.8k | forw_exec->outputs[i - forw_exec->output_size - forw_exec->input_size] < 0) && // If the output is empty, no need.
890 | 73.8k | !(i < forw_exec->output_size && forw_exec->outputs[i] < 0)) // If the output is empty for gradient, no need.
891 | 73.4k | node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63)); |
892 | 53.8k | for (i = 0; i < forw_exec->input_size; i++)
893 | 34.7k | if (!(forw_exec->inputs[i] < 0)) // If the inputs is empty, no need. |
894 | 34.7k | node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63)); |
895 | 19.1k | int maybe_noop = 1; |
896 | 23.1k | for (i = 0; i < forw_exec->input_size; i++)
897 | | /* See if it is used as wrt, if not, no need to run this node at all. */
898 | 23.1k | if (forw_exec->inputs[i] >= 0 && used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1 : forw_exec->inputs[i]] & WRT_SYMBOL_USE)
899 | 19.1k | { |
900 | 19.1k | maybe_noop = 0; |
901 | 19.1k | break; |
902 | 19.1k | } |
903 | 19.1k | if (maybe_noop) |
904 | 0 | { |
905 | 0 | for (i = 0; i < node->input_bitmask_size; i++) |
906 | 0 | node->input_bitmasks[i] = 0; |
907 | 0 | for (i = 0; i < node->output_bitmask_size; i++) |
908 | 0 | node->output_bitmasks[i] = 0; |
909 | 0 | node->output_bitmask_size = 0; |
910 | 19.1k | } else if (cmd.cmd == CCV_NNC_GRAPH_FORWARD || cmd.cmd == CCV_NNC_GRAPH_BACKWARD) { |
911 | | // Clear out all potential outputs if we think it is not a wrt symbols. |
912 | 6 | for (i = 0; i < forw_exec->input_size; i++)
913 | 4 | if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) &&
914 | 4 | !(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1 : forw_exec->inputs[i]] & WRT_SYMBOL_USE))
915 | 1 | node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63)); |
916 | | // But for now, assuming we need all input gradients. |
917 | | // Clear out all inputs / outputs from forward op. |
918 | 8 | for (i = forw_exec->output_size; i < forw_exec->output_size * 2 + forw_exec->input_size; i++)
919 | 6 | node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63)); |
920 | 19.1k | } else if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) { |
921 | 16.7k | int flag; /* Only continue if it changed */ |
922 | 32.1k | do { |
923 | 32.1k | flag = 0; |
924 | | /* Check if the output first */ |
925 | 93.3k | for (i = 0; i < forw_exec->input_size; i++)
926 | | /* Only try to eliminate the one that is not used. */ |
927 | 61.2k | if ((node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) && |
928 | 61.2k | !(used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1 : forw_exec->inputs[i]] & WRT_SYMBOL_USE))
929 | 8.62k | { |
930 | 8.62k | node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63)); |
931 | | /* If it worked, mark it as flagged. */ |
932 | 8.62k | if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) |
933 | 8.58k | flag = 1; |
934 | 46 | else /* Refit this with the bit back again. */ |
935 | 46 | node->output_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63)); |
936 | 8.62k | } |
937 | 159k | for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; i++)
938 | 127k | if ((node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) && |
939 | 127k | (96.4k i >= forw_exec->output_size96.4k || |
940 | 96.4k | !(used_grad[32.3k tensor_symbol_info[forw_exec->outputs[i]].alias_ref32.3k ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 143 : forw_exec->outputs[i]32.3k ] & F_SYMBOL_USE))) |
941 | 82.0k | { /* Try to eliminate one of the input. */ |
942 | 82.0k | node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63)); |
943 | | /* If it worked, mark it as flagged. */ |
944 | 82.0k | if (ccv_nnc_cmd_bitmask(cmd, forw_exec->output_size * 2 + forw_exec->input_size, forw_exec->input_size, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) |
945 | 28.8k | flag = 1; |
946 | 53.2k | else /* Refit this with the bit back again. */ |
947 | 53.2k | node->input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63)); |
948 | 82.0k | } |
949 | 32.1k | } while (flag); |
950 | 16.7k | } |
951 | 38.6k | for (i = 0; i < forw_exec->output_size; i++19.5k ) |
952 | 19.5k | if (node->input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
953 | | /* Mark it is used as wrt. */ |
954 | 19.1k | used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 121 : forw_exec->outputs[i]19.0k ] |= WRT_SYMBOL_USE; |
955 | 53.8k | for (i = 0; i < forw_exec->input_size; i++34.7k ) |
956 | | /* Mark it is used as f. */ |
957 | 34.7k | if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
958 | 26.1k | used_grad[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 12.12k : forw_exec->inputs[i]24.0k ] |= F_SYMBOL_USE; |
959 | 19.1k | } |
960 | 19.1k | } ccv_nnc_graph_visit_endfor |
961 | 6.78k | ccv_array_t* sub_f_symbols = 0; |
962 | 6.78k | ccv_array_t* sub_wrt_symbols = 0; |
963 | 19.1k | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) { |
964 | 19.1k | ccv_nnc_graph_backward_info_t* node = backward_info + idx; |
965 | 19.1k | const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx; |
966 | | /* Only interested in the ones on the f / wrt flow */ |
967 | 19.1k | if ((node->f_wrt & 0x3) == 0x3 && forw_exec->graph_ref_size > 019.1k ) |
968 | 2 | { |
969 | 2 | uint64_t stack_input_bitmasks1[node->input_bitmask_size]; |
970 | 2 | uint64_t stack_input_bitmasks2[node->input_bitmask_size]; |
971 | 2 | uint64_t* const input_bitmasks = forw_exec->graph_ref_size > 1 ? stack_input_bitmasks11 : node->input_bitmasks1 ; |
972 | | // We collect input masks into this location. |
973 | 2 | if (forw_exec->graph_ref_size > 1) |
974 | 1 | memset(stack_input_bitmasks2, 0, sizeof(uint64_t) * node->input_bitmask_size); |
975 | 6 | for (p = 0; p < forw_exec->graph_ref_size; p++4 ) |
976 | 4 | { |
977 | | // Reset the stack input bitmasks. |
978 | 4 | if (forw_exec->graph_ref_size > 1) |
979 | 3 | memcpy(stack_input_bitmasks1, node->input_bitmasks, sizeof(uint64_t) * node->input_bitmask_size); |
980 | | // Now calling it recursively until we are sure no f_symbols can be removed. |
981 | 4 | const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1; |
982 | 4 | ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref; |
983 | 4 | if (!sub_wrt_symbols) |
984 | 2 | sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
985 | 2 | else |
986 | 2 | ccv_array_clear(sub_wrt_symbols); |
987 | 12 | for (i = 0; i < forw_exec->input_size; i++8 ) |
988 | 8 | if (node->output_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
989 | 7 | { |
990 | 7 | const ccv_array_t* const s_refs = tensor_symbol_info[forw_exec->inputs[i]].s_ref; |
991 | 7 | const int s_ref = s_refs && s_refs->rnum > graph_ref ? *(int*)5 ccv_array_get5 (s_refs, graph_ref) - 1 : -12 ; |
992 | 7 | if (s_ref >= 0) |
993 | 3 | { |
994 | 3 | ccv_nnc_tensor_symbol_t sub_wrt_symbol = { |
995 | 3 | .d = s_ref, |
996 | 3 | .graph = sub_prep->graph, |
997 | 3 | }; |
998 | 3 | ccv_array_push(sub_wrt_symbols, &sub_wrt_symbol); |
999 | 3 | } |
1000 | 7 | } |
1001 | 4 | int flag; /* Only continue if it changed */ |
1002 | 4 | do { |
1003 | 4 | flag = 0; |
1004 | 8 | for (i = 0; i < forw_exec->output_size; i++4 ) |
1005 | | // Try to reduce the number of inputs for the backward graph. If it is not tagged as F_SYMBOL_USE, we can reduce it. |
1006 | | // It is reducible because this sub graph may have multiple computation paths, therefore, some of these may not |
1007 | | // involve our wrt symbols at all. |
1008 | 4 | if (!(used_grad[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 10 : forw_exec->outputs[i]] & F_SYMBOL_USE) && |
1009 | 4 | input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))0 ) |
1010 | 0 | { /* Try to eliminate one of the input. */ |
1011 | 0 | input_bitmasks[i >> 6] &= ~((uint64_t)1 << (i & 63)); |
1012 | 0 | if (!sub_f_symbols) |
1013 | 0 | sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1014 | 0 | else |
1015 | 0 | ccv_array_clear(sub_f_symbols); |
1016 | 0 | for (j = 0; j < forw_exec->output_size; j++) |
1017 | 0 | if (node->input_bitmasks[j >> 6] & ((uint64_t)1 << (j & 63))) |
1018 | 0 | { |
1019 | 0 | const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[j]].s_ref, graph_ref) - 1; |
1020 | 0 | assert(s_ref >= 0); |
1021 | 0 | ccv_nnc_tensor_symbol_t sub_f_symbol = { |
1022 | 0 | .d = s_ref, |
1023 | 0 | .graph = sub_prep->graph, |
1024 | 0 | }; |
1025 | 0 | ccv_array_push(sub_f_symbols, &sub_f_symbol); |
1026 | 0 | } |
1027 | 0 | if (_ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph))) |
1028 | 0 | flag = 1; |
1029 | 0 | else /* Refit this with the bit back again. */ |
1030 | 0 | input_bitmasks[i >> 6] |= ((uint64_t)1 << (i & 63)); |
1031 | 0 | } |
1032 | 4 | } while (flag); |
1033 | | // I am done, need to redo above for sub_prep, and it has to be successful now. |
1034 | 4 | if (!sub_f_symbols) |
1035 | 2 | sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1036 | 2 | else |
1037 | 2 | ccv_array_clear(sub_f_symbols); |
1038 | 8 | for (i = 0; i < forw_exec->output_size; i++4 ) |
1039 | 4 | if (input_bitmasks[i >> 6] & ((uint64_t)1 << (i & 63))) |
1040 | 4 | { |
1041 | 4 | const int s_ref = *(int*)ccv_array_get(tensor_symbol_info[forw_exec->outputs[i]].s_ref, graph_ref) - 1; |
1042 | 4 | assert(s_ref >= 0); |
1043 | 4 | ccv_nnc_tensor_symbol_t sub_f_symbol = { |
1044 | 4 | .d = s_ref, |
1045 | 4 | .graph = sub_prep->graph, |
1046 | 4 | }; |
1047 | 4 | ccv_array_push(sub_f_symbols, &sub_f_symbol); |
1048 | 4 | } |
1049 | 4 | _ccv_nnc_symbolic_graph_backward_prep_prune_ops(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph)); |
1050 | 4 | if (forw_exec->graph_ref_size > 1) |
1051 | 6 | for (i = 0; 3 i < node->input_bitmask_size; i++3 ) |
1052 | 3 | stack_input_bitmasks2[i] |= input_bitmasks[i]; |
1053 | 4 | } |
1054 | 2 | if (forw_exec->graph_ref_size > 1) |
1055 | 1 | memcpy(node->input_bitmasks, stack_input_bitmasks2, sizeof(uint64_t) * node->input_bitmask_size); |
1056 | 2 | } |
1057 | 19.1k | } ccv_nnc_graph_visit_endfor |
1058 | 6.78k | if (sub_f_symbols) |
1059 | 2 | ccv_array_free(sub_f_symbols); |
1060 | 6.78k | if (sub_wrt_symbols) |
1061 | 2 | ccv_array_free(sub_wrt_symbols); |
1062 | 6.78k | int flag = 1; |
1063 | 13.5k | for (i = 0; i < f_symbol_size && flag6.79k ; i++6.79k ) |
1064 | 6.79k | flag = (used_grad[tensor_symbol_info[f_symbols[i].d].alias_ref ? tensor_symbol_info[f_symbols[i].d].alias_ref - 10 : f_symbols[i].d] & WRT_SYMBOL_USE); |
1065 | 6.78k | ccfree(used_grad); |
1066 | 6.78k | return flag; |
1067 | 6.78k | } |
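
The pruning pass above manipulates per-node input/output bitmasks stored as flat uint64_t arrays: bit i lives in word i >> 6 at position i & 63. It works by trial and revert: clear a candidate bit, ask ccv_nnc_cmd_bitmask whether the backward command can still execute with the remaining inputs/outputs, and restore the bit if it cannot. A minimal sketch of the bit-addressing idiom follows; the helper names are illustrative, not part of the library:

    #include <stdint.h>

    // Set, clear and test bit i in a flat uint64_t bitmask, as done throughout the pruning pass.
    static inline void bitmask_set(uint64_t* const mask, const int i)
    {
        mask[i >> 6] |= ((uint64_t)1 << (i & 63));
    }

    static inline void bitmask_clear(uint64_t* const mask, const int i)
    {
        mask[i >> 6] &= ~((uint64_t)1 << (i & 63));
    }

    static inline int bitmask_isset(const uint64_t* const mask, const int i)
    {
        return !!(mask[i >> 6] & ((uint64_t)1 << (i & 63)));
    }
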
1068 | | |
1069 | | static void _ccv_nnc_symbolic_graph_backward_prep_gen(ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const int is_while, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size) |
1070 | 6.78k | { |
1071 | 6.78k | const int exec_symbol_info_size = backward_prep->exec_symbol_info_size; |
1072 | 6.78k | const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size; |
1073 | 6.78k | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info; |
1074 | 6.78k | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info; |
1075 | 6.78k | const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit; |
1076 | | // Now, for each one of these, find a reverse graph. |
1077 | 6.78k | ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info; |
1078 | 6.78k | const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit; |
1079 | 6.78k | int i, j; |
1080 | | // Now, only the flow from f_symbols back to wrt_symbols is of interest to us. |
1081 | | // Visit the graph in reverse order, build the AD nodes. |
1082 | 6.78k | ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = (ccv_nnc_autograd_graph_exec_symbol_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_autograd_graph_exec_symbol_t)); |
1083 | 6.78k | int max_forw_input_size = 0, max_forw_output_size = 0; |
1084 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++19.1k ) |
1085 | 19.1k | if ((backward_info[i].f_wrt & 0x3) == 0x3) |
1086 | 19.1k | { |
1087 | 19.1k | max_forw_input_size = ccv_max(max_forw_input_size, exec_symbol_info[i].input_size); |
1088 | 19.1k | max_forw_output_size = ccv_max(max_forw_output_size, exec_symbol_info[i].output_size); |
1089 | 19.1k | if (backward_info[i].outgoings) |
1090 | 12.3k | { |
1091 | | // Copy over the outgoing bits. |
1092 | 12.3k | autograd_execs[i].outgoings = ccv_array_new(sizeof(int), backward_info[i].outgoings->rnum, 0); |
1093 | 24.6k | for (j = 0; j < backward_info[i].outgoings->rnum; j++12.3k ) |
1094 | 12.3k | { |
1095 | 12.3k | const int d = *(int*)ccv_array_get(backward_info[i].outgoings, j); |
1096 | | // Only push the outgoing node if it is in the f_wrt path. |
1097 | 12.3k | if ((backward_info[d].f_wrt & 0x3) == 0x3) |
1098 | 12.3k | ccv_array_push(autograd_execs[i].outgoings, &d); |
1099 | 12.3k | } |
1100 | 12.3k | } |
1101 | 19.1k | } |
1102 | 6.78k | int max_forw_inputs[ccv_max(1, max_forw_input_size)]; |
1103 | 6.78k | int max_forw_outputs[ccv_max(1, max_forw_output_size)]; |
1104 | 6.78k | ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = (ccv_nnc_autograd_tensor_version_t*)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_autograd_tensor_version_t)); |
1105 | 6.78k | ccv_array_t* autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), tensor_symbol_info_size, 0); |
1106 | 6.78k | ccv_array_t* sum_or_set_execs = ccv_array_new(sizeof(ccv_nnc_sum_or_set_graph_exec_symbol_t), 0, 0); |
1107 | 19.1k | ccv_nnc_graph_visit_for(backward_visit, backward_info, back_info_node, idx) { |
1108 | | /* This is required by both the f flow and the wrt flow, therefore, of interest to us */ |
1109 | 19.1k | if ((back_info_node->f_wrt & 0x3) == 0x3) |
1110 | 19.1k | { |
1111 | 19.1k | const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx; |
1112 | 19.1k | ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + idx; |
1113 | 19.1k | back_exec->cmd = forw_exec->cmd; |
1114 | 19.1k | if (back_exec->cmd.cmd != CCV_NNC_NOOP) |
1115 | 19.1k | back_exec->cmd.cmd += 1; /* Backward command is the one after forward command. */ |
1116 | 19.1k | assert(ccv_nnc_cmd_is_backward(back_exec->cmd) || back_exec->cmd.cmd == CCV_NNC_NOOP); |
1117 | 19.1k | if (!back_info_node->output_bitmask_size) /* This has no output, can be a noop. */ |
1118 | 0 | back_exec->cmd.cmd = CCV_NNC_NOOP; |
1119 | 19.1k | else { |
1120 | 19.1k | int* back_input_map = max_forw_outputs; |
1121 | 19.1k | int* back_output_map = max_forw_inputs; |
1122 | 19.1k | _ccv_nnc_symbolic_graph_backward_exec_io(forw_exec, &back_input_map, &back_output_map, &back_exec->input_size, &back_exec->output_size); |
1123 | 19.1k | back_exec->inputs = ccmalloc(sizeof(int) * (back_exec->input_size + back_exec->output_size)); |
1124 | 19.1k | back_exec->outputs = back_exec->inputs + back_exec->input_size; |
1125 | | /* Need to compute input before we compute output */ |
1126 | 38.6k | for (i = 0; i < back_exec->input_size; i++19.5k ) |
1127 | 19.5k | { |
1128 | | /* If we can skip this input, do that. */ |
1129 | 19.5k | if (!(back_info_node->input_bitmasks[i >> 6] & ((uint64_t)1 << i))) |
1130 | 424 | continue; |
1131 | 19.1k | const int d = back_input_map[i]; |
1132 | 19.1k | const int alias_ref = tensor_symbol_info[d].alias_ref; |
1133 | 19.1k | ccv_nnc_autograd_tensor_version_t* tensor_ver = alias_ref ? autograd_tensor_versions + (alias_ref - 1)21 : autograd_tensor_versions + d19.0k ; |
1134 | | /* Initialization tensor, should correspond to f symbols */ |
1135 | 19.1k | if (!tensor_ver->ref_version) |
1136 | 6.79k | { |
1137 | 6.79k | ccv_nnc_autograd_tensor_symbol_t tensor_sym = {}; |
1138 | 6.79k | if (!alias_ref) |
1139 | 6.79k | { |
1140 | 6.79k | tensor_sym.d = d; |
1141 | 6.79k | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1142 | 6.79k | const ccv_nnc_tensor_ref_t tensor_ref = { |
1143 | 6.79k | .d = autograd_tensor_symbols->rnum - 1, |
1144 | 6.79k | .x = idx, |
1145 | 6.79k | .alias_registry = 0 |
1146 | 6.79k | }; |
1147 | 6.79k | tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0); |
1148 | 6.79k | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1149 | 6.79k | } else { |
1150 | 2 | tensor_sym.d = alias_ref - 1; |
1151 | 2 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1152 | 2 | const ccv_nnc_tensor_ref_t tensor_ref = { |
1153 | 2 | .d = autograd_tensor_symbols->rnum - 1, |
1154 | 2 | .x = idx, |
1155 | 2 | .alias_registry = ccv_array_new(sizeof(int), 1, 0) |
1156 | 2 | }; |
1157 | 2 | tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0); |
1158 | 2 | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1159 | 2 | tensor_sym.d = d; /* set back */ |
1160 | 2 | tensor_sym.alias_ref = tensor_ref.d + 1; |
1161 | 2 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1162 | 2 | const int ad = autograd_tensor_symbols->rnum - 1; |
1163 | 2 | ccv_array_push(tensor_ref.alias_registry, &ad); |
1164 | 2 | } |
1165 | 6.79k | } |
1166 | | /* The simplest case (most common), it is not an alias. */ |
1167 | 19.1k | if (!alias_ref) |
1168 | 19.0k | { |
1169 | | /* Even simpler, this only have one reference tensor, thus, pass this as input. */ |
1170 | 19.0k | if (tensor_ver->c == tensor_ver->ref_version->rnum - 1) |
1171 | 14.8k | { |
1172 | 14.8k | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c); |
1173 | | /* There are alias associated with this tensor ref, zero it out when this tensor is allocated. */ |
1174 | | /* This is required. Consider the case where we have an alias of this tensor used somewhere */ |
1175 | | /* on forward pass, when we compute backward, we have that alias computed first, however, its */ |
1176 | | /* underlying tensor is not zero initialized, and we will end up with garbage values here. */ |
1177 | 14.8k | if (tensor_ref->alias_registry && |
1178 | | /* Loop over to see if this tensor is fully occupied to avoid extra zero step. */ |
1179 | 14.8k | !_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)2.08k ) |
1180 | 1 | { |
1181 | 1 | ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
1182 | 1 | assert(tensor_sym->alias_ref == 0); |
1183 | 1 | tensor_sym->flags = CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS; |
1184 | 1 | } |
1185 | 14.8k | back_exec->inputs[i] = tensor_ref->d; |
1186 | 14.8k | } else { |
1187 | | /* Otherwise, we need to sum them up, and then pass the summed result to the computation. */ |
1188 | 4.24k | _ccv_nnc_graph_sum_autograd_tensor_versions(idx, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs); |
1189 | 4.24k | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c); |
1190 | 4.24k | back_exec->inputs[i] = tensor_ref->d; |
1191 | 4.24k | } |
1192 | 19.0k | } else |
1193 | | /* If this is an alias, go through all available tensor ref versions */ |
1194 | 21 | back_exec->inputs[i] = _ccv_nnc_graph_sum_autograd_tensor_versions_alias(idx, d, tensor_symbol_info, exec_symbol_info_size, tensor_symbol_info + d, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs); |
1195 | 19.1k | } |
1196 | 53.8k | for (i = 0; 19.1k i < back_exec->output_size; i++34.7k ) |
1197 | 34.7k | { |
1198 | | /* If we can skip this output, do that. */ |
1199 | 34.7k | if (!(back_info_node->output_bitmasks[i >> 6] & ((uint64_t)1 << i))) |
1200 | 8.59k | continue; |
1201 | 26.1k | const int d = back_output_map[i]; |
1202 | 26.1k | const int alias_ref = tensor_symbol_info[d].alias_ref; |
1203 | 26.1k | ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
1204 | 26.1k | .d = d |
1205 | 26.1k | }; |
1206 | | /* The simplest case (most common), it is not an alias. */ |
1207 | 26.1k | if (!alias_ref) |
1208 | 24.0k | { |
1209 | 24.0k | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1210 | 24.0k | const ccv_nnc_tensor_ref_t tensor_ref = { |
1211 | 24.0k | .d = autograd_tensor_symbols->rnum - 1, |
1212 | 24.0k | .x = idx, |
1213 | 24.0k | .exec_registry = 0, |
1214 | 24.0k | .alias_registry = 0 |
1215 | 24.0k | }; |
1216 | 24.0k | ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d; |
1217 | 24.0k | if (!tensor_ver->ref_version) |
1218 | 19.7k | tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0); |
1219 | 24.0k | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1220 | 24.0k | back_exec->outputs[i] = tensor_ref.d; |
1221 | 24.0k | } else { |
1222 | | /* Otherwise, in case this is an alias, we try to find an existing one (in tensor_ver) |
1223 | | * and see if it can meet the need (thus, for the tensor info / ofs, it fits). */ |
1224 | 2.12k | ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + (alias_ref - 1); |
1225 | 2.12k | if (!tensor_ver->ref_version) |
1226 | 2.09k | tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0); |
1227 | | /* If a ref version already exists, check if any of these not-yet-sealed tensors have free space. */ |
1228 | 2.12k | int found = 0; |
1229 | 2.17k | for (j = tensor_ver->c; !found && j < tensor_ver->ref_version->rnum2.16k ; j++50 ) |
1230 | 50 | { |
1231 | 50 | ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, j); |
1232 | 50 | if (!_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, tensor_symbol_info + d)) |
1233 | 9 | { |
1234 | 9 | tensor_sym.alias_ref = tensor_ref->d + 1; |
1235 | 9 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1236 | 9 | const int ad = autograd_tensor_symbols->rnum - 1; |
1237 | 9 | ccv_array_push(tensor_ref->alias_registry, &ad); |
1238 | 9 | if (!tensor_ref->exec_registry) |
1239 | 7 | tensor_ref->exec_registry = ccv_array_new(sizeof(int), 1, 0); |
1240 | 9 | ccv_array_push(tensor_ref->exec_registry, &idx); |
1241 | 9 | back_exec->outputs[i] = ad; |
1242 | 9 | found = 1; |
1243 | 9 | } |
1244 | 50 | } |
1245 | 2.12k | if (!found) /* Cannot find a tensor ref to insert into, create one first */ |
1246 | 2.11k | { |
1247 | 2.11k | tensor_sym.d = alias_ref - 1; /* Reference back to the non-alias. */ |
1248 | 2.11k | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1249 | 2.11k | const ccv_nnc_tensor_ref_t tensor_ref = { |
1250 | 2.11k | .d = autograd_tensor_symbols->rnum - 1, |
1251 | 2.11k | .x = idx, |
1252 | 2.11k | .exec_registry = 0, |
1253 | 2.11k | .alias_registry = ccv_array_new(sizeof(int), 1, 0) |
1254 | 2.11k | }; |
1255 | 2.11k | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1256 | 2.11k | tensor_sym.d = d; /* set back */ |
1257 | 2.11k | tensor_sym.alias_ref = tensor_ref.d + 1; |
1258 | 2.11k | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1259 | 2.11k | const int ad = autograd_tensor_symbols->rnum - 1; |
1260 | 2.11k | ccv_array_push(tensor_ref.alias_registry, &ad); |
1261 | 2.11k | back_exec->outputs[i] = ad; |
1262 | 2.11k | } |
1263 | 2.12k | } |
1264 | 26.1k | } |
1265 | 19.1k | } |
1266 | 19.1k | } |
1267 | 19.1k | } ccv_nnc_graph_visit_endfor |
1268 | | // Find all relevant wrt symbols, generate sum for them if needed. |
1269 | 16.3k | for (i = 0; 6.78k i < wrt_symbol_size; i++9.53k ) |
1270 | 9.53k | { |
1271 | 9.53k | const int d = wrt_symbols[i].d; |
1272 | 9.53k | if (d < 0) |
1273 | 9 | continue; |
1274 | 9.52k | const int ref_d = (!tensor_symbol_info[d].alias_ref) ? d9.52k : tensor_symbol_info[d].alias_ref - 11 ; |
1275 | 9.52k | ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + ref_d; |
1276 | 9.52k | if (!tensor_ver->ref_version) |
1277 | 1 | { |
1278 | | // This wrt symbol is not available at all; in this case, we set its flag to init to zeros. |
1279 | 1 | const ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
1280 | 1 | .d = ref_d |
1281 | 1 | }; |
1282 | 1 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1283 | 1 | ccv_nnc_sum_or_set_graph_exec_symbol_t set_exec = { |
1284 | 1 | .value = 0, |
1285 | 1 | .output = autograd_tensor_symbols->rnum - 1, |
1286 | 1 | }; |
1287 | 1 | ccv_array_push(sum_or_set_execs, &set_exec); |
1288 | | // Insert the one to be set to zero. |
1289 | 1 | const ccv_nnc_tensor_ref_t tensor_ref = { |
1290 | 1 | .d = autograd_tensor_symbols->rnum - 1, |
1291 | 1 | .x = exec_symbol_info_size + sum_or_set_execs->rnum - 1, |
1292 | 1 | }; |
1293 | 1 | tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0); |
1294 | 1 | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1295 | 1 | continue; |
1296 | 1 | } |
1297 | | // If it is a while loop, we need to insert an accumulator into the graph (this is expressed as an initialization tensor summed with existing results). |
1298 | | // First, insert the initialization tensor if this wrt result is not used directly in the next while loop (thus, it participates in the computation, therefore, no need to accumulate). |
1299 | 9.52k | if (is_while && !tensor_symbol_info[ref_d].assign_ref2 && |
1300 | 9.52k | _ccv_nnc_tensor_ref_version_find_init(tensor_ver) < 01 ) // If the initialization tensor is not inserted yet. |
1301 | 1 | { |
1302 | 1 | const ccv_nnc_autograd_tensor_symbol_t tensor_sym = { |
1303 | 1 | .d = ref_d |
1304 | 1 | }; |
1305 | 1 | ccv_array_push(autograd_tensor_symbols, &tensor_sym); |
1306 | | // Insert the one to be summed. |
1307 | 1 | const ccv_nnc_tensor_ref_t tensor_ref = { |
1308 | 1 | .d = autograd_tensor_symbols->rnum - 1, |
1309 | 1 | .x = -1, // This denotes it is an initialization vector. |
1310 | 1 | }; |
1311 | 1 | ccv_array_push(tensor_ver->ref_version, &tensor_ref); |
1312 | 1 | } |
1313 | | // If there is more than one tensor in the list, it is possible to sum them up. |
1314 | 9.52k | if (tensor_ver->c < tensor_ver->ref_version->rnum - 1) |
1315 | 30 | _ccv_nnc_graph_sum_autograd_tensor_versions(-1, ref_d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_or_set_execs); |
1316 | | // The tensor version should have ref_version, and only one now (after summing up). |
1317 | 9.52k | assert(tensor_ver->c == tensor_ver->ref_version->rnum - 1); |
1318 | 9.52k | } |
1319 | | // Adding additional fields to backward_prep now. |
1320 | 6.78k | backward_prep->autograd_execs = autograd_execs; |
1321 | 6.78k | backward_prep->autograd_tensor_versions = autograd_tensor_versions; |
1322 | 6.78k | backward_prep->autograd_tensor_symbols = autograd_tensor_symbols; |
1323 | 6.78k | backward_prep->sum_or_set_execs = sum_or_set_execs; |
1324 | 6.78k | ccv_array_t* sub_f_symbols = 0; |
1325 | 6.78k | ccv_array_t* sub_wrt_symbols = 0; |
1326 | 19.1k | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) { |
1327 | 19.1k | ccv_nnc_graph_backward_info_t* node = backward_info + idx; |
1328 | 19.1k | const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx; |
1329 | | /* Only interested in the ones on the f / wrt flow */ |
1330 | 19.1k | if ((node->f_wrt & 0x3) == 0x3) |
1331 | 19.1k | { |
1332 | 19.1k | const int is_while = (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE); |
1333 | 19.1k | for (i = 0; i < forw_exec->graph_ref_size; i++4 ) |
1334 | 4 | { |
1335 | | // Now calling it recursively until we are sure no f_symbols can be removed. |
1336 | 4 | const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[i] - 1; |
1337 | 4 | ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep = backward_prep->sub_preps + graph_ref; |
1338 | 4 | if (!sub_wrt_symbols) |
1339 | 2 | sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1340 | 4 | if (!sub_f_symbols) |
1341 | 2 | sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1342 | 4 | _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, node->input_bitmasks, node->output_bitmasks, sub_f_symbols, sub_wrt_symbols); |
1343 | 4 | _ccv_nnc_symbolic_graph_backward_prep_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, is_while, ccv_nnc_symbolic_graph_sources(sub_prep->graph), ccv_nnc_symbolic_graph_source_size(sub_prep->graph), ccv_nnc_symbolic_graph_destinations(sub_prep->graph), ccv_nnc_symbolic_graph_destination_size(sub_prep->graph)); |
1344 | 4 | } |
1345 | 19.1k | } |
1346 | 19.1k | } ccv_nnc_graph_visit_endfor |
1347 | 6.78k | if (sub_f_symbols) |
1348 | 2 | ccv_array_free(sub_f_symbols); |
1349 | 6.78k | if (sub_wrt_symbols) |
1350 | 2 | ccv_array_free(sub_wrt_symbols); |
1351 | 6.78k | } |
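
A recurring pattern in the generation pass above is gradient accumulation: each time a backward node produces a gradient for the same forward tensor, a new ccv_nnc_tensor_ref_t is appended to that tensor's ref_version array, and a consumer either uses a single pending version directly or folds all pending versions into one via _ccv_nnc_graph_sum_autograd_tensor_versions. The sketch below restates that pattern with hypothetical types and a caller-supplied sum emitter; it is illustrative only, not the library's actual data structures:

    // Hypothetical, simplified stand-in for ccv_nnc_autograd_tensor_version_t.
    typedef struct {
        int c;        // first version not yet folded into a sum
        int rnum;     // number of versions appended so far
        int refs[8];  // symbol ids of the contributions (fixed cap, enough for the sketch)
    } grad_versions_t;

    // Append a fresh gradient contribution (one per backward node output).
    static void grad_versions_push(grad_versions_t* const ver, const int ref)
    {
        ver->refs[ver->rnum++] = ref; // the sketch assumes rnum stays below the fixed cap
    }

    // Read the gradient: reuse a lone pending version, otherwise sum the pending ones first.
    static int grad_versions_read(grad_versions_t* const ver, int (*emit_sum)(const int* refs, int count))
    {
        if (ver->c == ver->rnum - 1) // exactly one pending contribution, pass it through
            return ver->refs[ver->c];
        const int summed = emit_sum(ver->refs + ver->c, ver->rnum - ver->c);
        ver->refs[ver->rnum] = summed; // the sum becomes the newest (and only live) version
        ver->c = ver->rnum;            // everything before it is now sealed
        ++ver->rnum;
        return summed;
    }

The while-loop case adds one more wrinkle, visible above: an initialization version with x = -1 is inserted so the accumulator starts from a zeroed tensor carried across iterations.
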
1352 | | |
1353 | | static void _ccv_nnc_symbolic_graph_backward_prep_free(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep) |
1354 | 6.78k | { |
1355 | 6.78k | int i, j; |
1356 | 6.78k | const int exec_symbol_info_size = backward_prep.exec_symbol_info_size; |
1357 | 6.78k | const int tensor_symbol_info_size = backward_prep.tensor_symbol_info_size; |
1358 | 6.78k | ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs; |
1359 | 6.78k | if (autograd_execs) |
1360 | 6.78k | { |
1361 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++19.1k ) |
1362 | 19.1k | { |
1363 | 19.1k | if (autograd_execs[i].inputs) |
1364 | 19.1k | ccfree(autograd_execs[i].inputs); |
1365 | 19.1k | if (autograd_execs[i].outgoings) |
1366 | 12.3k | ccv_array_free(autograd_execs[i].outgoings); |
1367 | 19.1k | } |
1368 | 6.78k | ccfree(autograd_execs); |
1369 | 6.78k | } |
1370 | 6.78k | ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions; |
1371 | 6.78k | if (autograd_tensor_versions) |
1372 | 6.78k | { |
1373 | 46.4k | for (i = 0; i < tensor_symbol_info_size; i++39.6k ) |
1374 | 39.6k | { |
1375 | 39.6k | if (autograd_tensor_versions[i].ref_version) |
1376 | 28.6k | { |
1377 | 65.9k | for (j = 0; j < autograd_tensor_versions[i].ref_version->rnum; j++37.2k ) |
1378 | 37.2k | { |
1379 | 37.2k | ccv_nnc_tensor_ref_t* ref_version = (ccv_nnc_tensor_ref_t*)ccv_array_get(autograd_tensor_versions[i].ref_version, j); |
1380 | 37.2k | if (ref_version->exec_registry) |
1381 | 7 | ccv_array_free(ref_version->exec_registry); |
1382 | 37.2k | if (ref_version->alias_registry) |
1383 | 2.12k | ccv_array_free(ref_version->alias_registry); |
1384 | 37.2k | } |
1385 | 28.6k | ccv_array_free(autograd_tensor_versions[i].ref_version); |
1386 | 28.6k | } |
1387 | 39.6k | } |
1388 | 6.78k | ccfree(autograd_tensor_versions); |
1389 | 6.78k | } |
1390 | 6.78k | if (backward_prep.autograd_tensor_symbols) |
1391 | 6.78k | ccv_array_free(backward_prep.autograd_tensor_symbols); |
1392 | 6.78k | ccv_array_t* const sum_or_set_execs = backward_prep.sum_or_set_execs; |
1393 | 6.78k | if (sum_or_set_execs) |
1394 | 6.78k | { |
1395 | 11.0k | for (i = 0; i < sum_or_set_execs->rnum; i++4.28k ) |
1396 | 4.28k | { |
1397 | 4.28k | ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i); |
1398 | 4.28k | if (sum_or_set->inputs) |
1399 | 4.28k | ccfree(sum_or_set->inputs); |
1400 | 4.28k | if (sum_or_set->outgoings) |
1401 | 4.25k | ccv_array_free(sum_or_set->outgoings); |
1402 | 4.28k | } |
1403 | 6.78k | ccv_array_free(sum_or_set_execs); |
1404 | 6.78k | } |
1405 | | // From here on, these are mandatory. |
1406 | 6.78k | ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info; |
1407 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++19.1k ) |
1408 | 19.1k | { |
1409 | 19.1k | if (backward_info[i].outgoings) |
1410 | 12.3k | ccv_array_free(backward_info[i].outgoings); |
1411 | 19.1k | if (backward_info[i].input_bitmasks) |
1412 | 19.1k | ccfree(backward_info[i].input_bitmasks); |
1413 | 19.1k | } |
1414 | 6.78k | ccfree(backward_info); |
1415 | 6.78k | ccv_nnc_graph_visit_free(backward_prep.backward_visit); |
1416 | 6.78k | ccv_nnc_graph_visit_free(backward_prep.forward_visit); |
1417 | 6.78k | ccfree(backward_prep.exec_symbol_info); |
1418 | 6.78k | ccfree(backward_prep.tensor_symbol_info); |
1419 | 6.78k | for (i = 0; i < backward_prep.sub_prep_size; i++4 ) |
1420 | 4 | _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep.sub_preps[i]); |
1421 | 6.78k | if (backward_prep.sub_preps) |
1422 | 2 | ccfree(backward_prep.sub_preps); |
1423 | 6.78k | } |
1424 | | |
1425 | | static void _ccv_nnc_add_backward_breakpoint_for_symbol(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_graph_exec_symbol_t breakpoint, ccv_nnc_symbolic_graph_t* const graph, ccv_array_t* const sub_breakpoints) |
1426 | 1 | { |
1427 | 1 | const ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0); |
1428 | 1 | ccv_array_push(sub_breakpoints, &noop); |
1429 | | // Now need to hook this up to the graph. |
1430 | 1 | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info; |
1431 | 1 | const ccv_nnc_graph_visit_t* const forward_visit = backward_prep->forward_visit; |
1432 | | // Now, for each one of these, find a reverse graph. |
1433 | 1 | ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info; |
1434 | 1 | int i; |
1435 | | // Clean up the high bit. |
1436 | 4 | for (i = 0; i < backward_prep->exec_symbol_info_size; i++3 ) |
1437 | 3 | backward_info[i].f_wrt &= ~0x4; |
1438 | 1 | assert((backward_info[breakpoint.d].f_wrt & 0x3) != 0x3); |
1439 | 1 | backward_info[breakpoint.d].f_wrt |= 0x4; |
1440 | 1 | const ccv_nnc_graph_visit_t* const backward_visit = backward_prep->backward_visit; |
1441 | 1 | const ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs; |
1442 | | // Going forward to find whether this breakpoint is a source node to some f_wrt nodes. |
1443 | 3 | ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, forw_exec, idx) { |
1444 | 3 | ccv_nnc_graph_backward_info_t* const node = backward_info + idx; |
1445 | | // If it is tagged on the breakpoint flow, but not as both f and wrt, flow through it. |
1446 | 3 | if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x31 ) |
1447 | 1 | for (i = 0; forw_exec->outgoings && i < forw_exec->outgoings->rnum0 ; i++0 ) |
1448 | 0 | { |
1449 | 0 | const int outgoing_idx = *(int*)ccv_array_get(forw_exec->outgoings, i); |
1450 | 0 | ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx; |
1451 | | // If this is an f_wrt node, concatenate. |
1452 | 0 | if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3) |
1453 | 0 | ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[outgoing_idx].symbol, noop); |
1454 | 0 | outgoing_node->f_wrt |= 0x4; |
1455 | 0 | } |
1456 | 3 | } ccv_nnc_graph_visit_endfor |
1457 | | // Going backward to find whether this breakpoint is a destination node for some f_wrt nodes. |
1458 | 3 | ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) { |
1459 | 3 | if ((node->f_wrt & 0x4) && (node->f_wrt & 0x3) != 0x32 ) |
1460 | 2 | for (i = 0; 1 node->outgoings && i < node->outgoings->rnum; i++1 ) |
1461 | 1 | { |
1462 | 1 | const int outgoing_idx = *(int*)ccv_array_get(node->outgoings, i); |
1463 | 1 | ccv_nnc_graph_backward_info_t* const outgoing_node = backward_info + outgoing_idx; |
1464 | | // If this is an f_wrt node, concatenate. |
1465 | 1 | if (!(outgoing_node->f_wrt & 0x4) && (outgoing_node->f_wrt & 0x3) == 0x3) |
1466 | 1 | ccv_nnc_graph_exec_symbol_concat(graph, noop, autograd_execs[outgoing_idx].symbol); |
1467 | 1 | outgoing_node->f_wrt |= 0x4; |
1468 | 1 | } |
1469 | 3 | } ccv_nnc_graph_visit_endfor |
1470 | 1 | } |
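
The breakpoint routine above reuses f_wrt as a small bit set: the two low bits carry the f-side and wrt-side reachability computed earlier (0x3 means the node sits on the full f-to-wrt path and therefore gets a backward op), while 0x4 is a scratch marker that is cleared on entry and then flooded outward from the breakpoint. Which of the two low bits tracks f versus wrt is not pinned down in this excerpt, so the names below are an assumption, shown only to make the masks readable:

    // Illustrative names for the f_wrt bits; the library itself uses the raw masks 0x1/0x2/0x3/0x4.
    enum {
        FLOW_F    = 0x1, // assumed: reachable from an f symbol
        FLOW_WRT  = 0x2, // assumed: reaches a wrt symbol
        FLOW_BOTH = 0x3, // on the f -> wrt path, i.e. a node that needs a backward op
        FLOW_MARK = 0x4  // scratch bit: breakpoint flood-fill marker, cleared before each use
    };

    // The checks in the routine then read as:
    //   (node->f_wrt & FLOW_MARK) && (node->f_wrt & FLOW_BOTH) != FLOW_BOTH            -> keep flowing through
    //   !(outgoing->f_wrt & FLOW_MARK) && (outgoing->f_wrt & FLOW_BOTH) == FLOW_BOTH   -> concat with the noop
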
1471 | | |
1472 | | static ccv_nnc_autograd_tensor_symbol_t* _ccv_nnc_autograd_tensor_symbol_from_tensor_version(ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_autograd_tensor_version_t* const tensor_ver) |
1473 | 7 | { |
1474 | 7 | assert(tensor_ver->ref_version); |
1475 | 7 | const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c); |
1476 | 7 | return (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
1477 | 7 | } |
1478 | | |
1479 | | static void _ccv_nnc_symbolic_graph_set_backward_carry_overs(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph) |
1480 | 1 | { |
1481 | 1 | int i; |
1482 | 5 | for (i = 0; i < backward_prep->graph->tensor_symbol_info->rnum; i++4 ) |
1483 | 4 | { |
1484 | 4 | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info + i; |
1485 | 4 | if (tensor_symbol_info->assign_ref) |
1486 | 1 | { |
1487 | 1 | const int assign_ref = tensor_symbol_info->assign_ref - 1; |
1488 | 1 | ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + assign_ref); |
1489 | 1 | ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + i); |
1490 | 1 | ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){ |
1491 | 1 | { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol } |
1492 | 1 | }, 1); |
1493 | 1 | } |
1494 | 4 | } |
1495 | 3 | for (i = 0; i < wrt_symbol_size; i++2 ) |
1496 | 2 | { |
1497 | 2 | const int d = wrt_symbols[i].d; |
1498 | 2 | if (d < 0) |
1499 | 0 | continue; |
1500 | 2 | const int ref_d = (!backward_prep->tensor_symbol_info[d].alias_ref) ? d : backward_prep->tensor_symbol_info[d].alias_ref - 10 ; |
1501 | 2 | const ccv_nnc_autograd_tensor_version_t* const tensor_ver = backward_prep->autograd_tensor_versions + ref_d; |
1502 | 2 | const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver); |
1503 | 2 | if (init_ref_ver >= 0) |
1504 | 1 | { |
1505 | 1 | const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d; |
1506 | 1 | ccv_nnc_autograd_tensor_symbol_t* const destination_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(backward_prep->autograd_tensor_symbols, init_d); |
1507 | 1 | ccv_nnc_autograd_tensor_symbol_t* const source_autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(backward_prep->autograd_tensor_symbols, backward_prep->autograd_tensor_versions + ref_d); |
1508 | 1 | ccv_nnc_symbolic_graph_set_carry_overs(graph, (ccv_nnc_tensor_symbol_map_t []){ |
1509 | 1 | { .source = source_autograd_symbol->symbol, .destination = destination_autograd_symbol->symbol } |
1510 | 1 | }, 1); |
1511 | 1 | } |
1512 | 2 | } |
1513 | 1 | } |
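
Note that assign_ref above follows the same convention as alias_ref, pair_ref and the CCV_NNC_GRAPH_REF entries throughout this file: the stored value is the referenced index plus one, so that 0 can mean "no reference", and every use site subtracts 1 (s_ref entries behave the same way, which is why a decremented value is checked with >= 0). A trivial sketch of that convention, with hypothetical helper names:

    // 1-based "ref" convention used by alias_ref, assign_ref, pair_ref, graph_ref and s_ref entries.
    static inline int ref_is_set(const int ref)
    {
        return ref > 0; // 0 means "no reference"
    }

    static inline int ref_index(const int ref)
    {
        return ref - 1; // only meaningful when ref_is_set(ref)
    }
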
1514 | | |
1515 | | static void _ccv_nnc_symbolic_graph_add_init_zeros(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols) |
1516 | 1 | { |
1517 | 1 | int i; |
1518 | 3 | for (i = 0; i < wrt_symbol_size; i++2 ) |
1519 | 2 | { |
1520 | 2 | const int d = wrt_symbols[i].d; |
1521 | 2 | if (d < 0) |
1522 | 0 | continue; |
1523 | 2 | const int ref_d = (!sub_prep->tensor_symbol_info[d].alias_ref) ? d : sub_prep->tensor_symbol_info[d].alias_ref - 10 ; |
1524 | 2 | const ccv_nnc_autograd_tensor_version_t* const tensor_ver = sub_prep->autograd_tensor_versions + ref_d; |
1525 | 2 | const int init_ref_ver = _ccv_nnc_tensor_ref_version_find_init(tensor_ver); |
1526 | 2 | if (init_ref_ver >= 0) |
1527 | 1 | { |
1528 | | // Need de-dup logic. |
1529 | 1 | const int init_d = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, init_ref_ver))->d; |
1530 | 1 | ccv_nnc_autograd_tensor_symbol_t* const init_autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(sub_prep->autograd_tensor_symbols, init_d); |
1531 | 1 | const ccv_nnc_tensor_symbol_info_t* const sub_init_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, init_autograd_symbol->symbol.d); |
1532 | | // If it doesn't have a parent ref yet, create one. |
1533 | 1 | if (!sub_init_symbol_info->p_ref) |
1534 | 1 | { |
1535 | 1 | ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, sub_prep->tensor_symbol_info[ref_d].info, 0); |
1536 | 1 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, CCV_NNC_TENSOR_SYMBOL_INIT_ZEROS); |
1537 | 1 | ccv_array_push(symbols, &new_symbol); |
1538 | 1 | ccv_nnc_tensor_symbol_hookup(graph, sub_graph, new_symbol, init_autograd_symbol->symbol); |
1539 | 1 | } |
1540 | 1 | } |
1541 | 2 | } |
1542 | 1 | } |
1543 | | |
1544 | | static void _ccv_nnc_symbolic_graph_add_tape_vars(const ccv_nnc_symbolic_graph_backward_prep_t* const sub_prep, ccv_nnc_symbolic_graph_t* const root, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const sub_graph, ccv_array_t* const symbols) |
1545 | 4 | { |
1546 | 4 | int i; |
1547 | 24 | for (i = 0; i < sub_graph->tensor_symbol_info->rnum; i++20 ) |
1548 | 20 | { |
1549 | 20 | const ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(sub_graph->tensor_symbol_info, i); |
1550 | 20 | if ((symbol_info->flags & CCV_NNC_TENSOR_SYMBOL_TAPE_VAR) && symbol_info->pair_ref7 ) |
1551 | 7 | { |
1552 | 7 | const int pair_ref = symbol_info->pair_ref - 1; |
1553 | 7 | const ccv_nnc_tensor_symbol_t root_symbol = ccv_nnc_tensor_symbol_resolve(root, (ccv_nnc_tensor_symbol_t){ |
1554 | 7 | .d = pair_ref, |
1555 | 7 | .graph = sub_prep->graph, |
1556 | 7 | }); |
1557 | 7 | if (root_symbol.d >= 0) |
1558 | 3 | { |
1559 | 3 | ccv_nnc_tensor_symbol_hookup(root, sub_graph, root_symbol, (ccv_nnc_tensor_symbol_t){ |
1560 | 3 | .d = i, |
1561 | 3 | .graph = sub_graph, |
1562 | 3 | }); |
1563 | 3 | if (symbols) |
1564 | 2 | { |
1565 | 2 | const ccv_nnc_tensor_symbol_t p_symbol = ccv_nnc_tensor_symbol_resolve(graph, (ccv_nnc_tensor_symbol_t){ |
1566 | 2 | .d = i, |
1567 | 2 | .graph = sub_graph, |
1568 | 2 | }); |
1569 | 2 | ccv_array_push(symbols, &p_symbol); |
1570 | 2 | } |
1571 | 3 | } |
1572 | 7 | } |
1573 | 20 | } |
1574 | 4 | } |
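
_ccv_nnc_symbolic_graph_backward_gen below uses the counterpart of the tape-var hookup shown above whenever the backward graph is not the forward graph itself: it mirrors the forward tensor symbol in the backward graph, pairs the two, and marks both as CCV_NNC_TENSOR_SYMBOL_TAPE_VAR so the forward value is recorded for the backward pass. A hedged sketch of that pattern, factored into a hypothetical helper (the function name and signature are illustrative; the individual calls appear verbatim below):

    // Mirror a forward symbol `forw` from `forw_graph` into `back_graph` as a tape variable.
    static ccv_nnc_tensor_symbol_t mirror_as_tape_var(ccv_nnc_symbolic_graph_t* const back_graph, ccv_nnc_symbolic_graph_t* const forw_graph, const ccv_nnc_tensor_symbol_t forw, const ccv_nnc_tensor_param_t info, const char* const name)
    {
        const ccv_nnc_tensor_symbol_t mirrored = ccv_nnc_tensor_symbol_new(back_graph, info, name);
        ccv_nnc_tensor_symbol_pair_with(back_graph, mirrored, forw); // remember which forward symbol this shadows
        const int flags = ccv_nnc_tensor_symbol_flags(forw_graph, forw) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR;
        ccv_nnc_tensor_symbol_set_flags(back_graph, mirrored, flags); // both sides become tape vars
        ccv_nnc_tensor_symbol_set_flags(forw_graph, forw, flags);
        return mirrored;
    }
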
1575 | | |
1576 | | static void _ccv_nnc_symbolic_graph_backward_gen(const ccv_nnc_symbolic_graph_backward_prep_t* const backward_prep, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph, ccv_nnc_symbolic_graph_t* const root) |
1577 | 6.78k | { |
1578 | 6.78k | assert(graph == backward_prep->graph || graph->pair == backward_prep->graph); |
1579 | 6.78k | const int exec_symbol_info_size = backward_prep->exec_symbol_info_size; |
1580 | 6.78k | const int tensor_symbol_info_size = backward_prep->tensor_symbol_info_size; |
1581 | 6.78k | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = backward_prep->exec_symbol_info; |
1582 | 6.78k | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = backward_prep->tensor_symbol_info; |
1583 | 6.78k | int i, j, k, p; |
1584 | 6.78k | ccv_array_t* const autograd_tensor_symbols = backward_prep->autograd_tensor_symbols; |
1585 | | // Generate required symbols based on the information gathered above. |
1586 | 46.1k | for (i = 0; i < autograd_tensor_symbols->rnum; i++39.4k ) |
1587 | 39.4k | { |
1588 | 39.4k | ccv_nnc_autograd_tensor_symbol_t* symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, i); |
1589 | 39.4k | assert(symbol->d >= 0); |
1590 | 39.4k | assert(symbol->d < tensor_symbol_info_size); |
1591 | 39.4k | const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + symbol->d; |
1592 | 39.4k | if (!symbol->alias_ref) |
1593 | 37.2k | { |
1594 | 37.2k | assert(!forw_symbol->alias_ref); |
1595 | 37.2k | symbol->symbol = ccv_nnc_tensor_symbol_new(graph, forw_symbol->info, 0); |
1596 | 37.2k | ccv_nnc_tensor_symbol_set_flags(graph, symbol->symbol, symbol->flags); |
1597 | 37.2k | } else { |
1598 | 2.14k | assert(forw_symbol->alias_ref); |
1599 | 2.14k | assert(symbol->flags == 0); // We don't set flags on alias. |
1600 | | // Due to our generation order, this must be after the original symbol is created. |
1601 | 2.14k | ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, symbol->alias_ref - 1); |
1602 | 2.14k | symbol->symbol = ccv_nnc_tensor_symbol_alias_new(graph, ref->symbol, forw_symbol->ofs, forw_symbol->stride, forw_symbol->info, 0); |
1603 | 2.14k | } |
1604 | 39.4k | } |
1605 | 6.78k | ccv_nnc_graph_backward_info_t* const backward_info = backward_prep->backward_info; |
1606 | 6.78k | ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep->autograd_execs; |
1607 | 6.78k | ccv_array_t* symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1608 | 6.78k | ccv_array_t* symbol_map = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_map_t), 0, 0); |
1609 | 6.78k | ccv_array_t* sub_f_symbols = 0; |
1610 | 6.78k | ccv_array_t* sub_wrt_symbols = 0; |
1611 | 6.78k | ccv_array_t* sub_execs = 0; |
1612 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++19.1k ) |
1613 | 19.1k | { |
1614 | | // This is not going to be an interesting node. Skip. |
1615 | 19.1k | if ((backward_info[i].f_wrt & 0x3) != 0x3) |
1616 | 86 | continue; |
1617 | 19.1k | ccv_nnc_graph_backward_info_t* const back_info = backward_info + i; |
1618 | 19.1k | ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i; |
1619 | 19.1k | if (back_exec->cmd.cmd == CCV_NNC_NOOP) |
1620 | 1 | { |
1621 | 1 | back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, 0, 0, 0, 0, 0); |
1622 | 1 | continue; |
1623 | 1 | } |
1624 | 19.1k | const ccv_nnc_graph_exec_symbol_info_t* const forw_exec = exec_symbol_info + i; |
1625 | 19.1k | if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_P_WHILE) |
1626 | 1 | { |
1627 | 1 | ccv_array_clear(symbols); |
1628 | 1 | const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[0] - 1; |
1629 | 1 | ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref; |
1630 | 1 | ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new(); |
1631 | 1 | sub_graph->pair = sub_prep->graph; |
1632 | 1 | if (!sub_wrt_symbols) |
1633 | 1 | sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1634 | | // I am done, need to redo above for sub_prep, and it has to be successful now. |
1635 | 1 | if (!sub_f_symbols) |
1636 | 1 | sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1637 | 1 | _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols); |
1638 | 1 | _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root); |
1639 | 1 | back_exec->symbol = ccv_nnc_symbolic_graph_while(graph, back_exec->cmd.cmd, sub_graph, forw_exec->name); |
1640 | 1 | if (!sub_execs) |
1641 | 1 | sub_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); |
1642 | 1 | ccv_array_clear(sub_execs); |
1643 | | // Find the breakpoints in the forward graph, creating the reverse ones. |
1644 | 2 | for (j = 0; j < sub_prep->graph->breakpoint_size; j++1 ) |
1645 | 1 | { |
1646 | 1 | const int d = sub_prep->graph->breakpoints[j].d; |
1647 | 1 | if (sub_prep->autograd_execs[d].symbol.graph) |
1648 | 0 | ccv_array_push(sub_execs, &sub_prep->autograd_execs[d].symbol); |
1649 | 1 | else |
1650 | 1 | _ccv_nnc_add_backward_breakpoint_for_symbol(sub_prep, sub_prep->graph->breakpoints[j], sub_graph, sub_execs); |
1651 | 1 | } |
1652 | 1 | ccv_nnc_symbolic_graph_set_while_expr(sub_graph, NOOP_GRAPH_WHILE_EXPR, 0, 0, 0, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sub_execs, 0), sub_execs->rnum); |
1653 | 1 | ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1654 | 1 | _ccv_nnc_symbolic_graph_set_backward_carry_overs(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph); |
1655 | 2 | for (j = 0; j < back_exec->input_size; j++1 ) |
1656 | 1 | if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1657 | 1 | ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol)); |
1658 | | // Find whether, among the wrt symbols, there is anything we need to init to zero; if there is, these need to be inputs here too. |
1659 | 1 | _ccv_nnc_symbolic_graph_add_init_zeros(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, graph, sub_graph, symbols); |
1660 | 1 | _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, symbols); |
1661 | | // input_size at this point may be different from back_exec->input_size, because we may have added zeroing tensors as input tensors. |
1662 | 1 | const int input_size = symbols->rnum; |
1663 | 3 | for (j = 0; j < back_exec->output_size; j++2 ) |
1664 | 2 | if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1665 | 1 | ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol)); |
1666 | 1 | const int output_size = symbols->rnum - input_size; |
1667 | 1 | const int p_idx = sub_prep->graph->p_idx - 1; |
1668 | 1 | assert(back_exec->input_size == forw_exec->output_size); |
1669 | 1 | k = 0; |
1670 | 2 | for (j = 0; j < back_exec->input_size; j++1 ) |
1671 | 1 | if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1672 | 1 | { |
1673 | 1 | const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->outputs[j]; |
1674 | 1 | const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1; |
1675 | 1 | assert(s_idx >= 0); |
1676 | 1 | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx); |
1677 | 1 | ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol); |
1678 | 1 | ++k; |
1679 | 1 | } |
1680 | 1 | k = input_size; // Reset k, the symbol pass already set up by add_init_zeros. |
1681 | 1 | assert(back_exec->output_size == forw_exec->input_size); |
1682 | 3 | for (j = 0; 1 j < back_exec->output_size; j++2 ) |
1683 | 2 | if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1684 | 1 | { |
1685 | 1 | const ccv_nnc_tensor_symbol_info_t* const info = tensor_symbol_info + forw_exec->inputs[j]; |
1686 | 1 | const int s_idx = *(int*)ccv_array_get(info->s_ref, p_idx) - 1; |
1687 | 1 | assert(s_idx >= 0); |
1688 | 1 | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + s_idx); |
1689 | 1 | ccv_nnc_tensor_symbol_hookup(graph, sub_graph, *(ccv_nnc_tensor_symbol_t*)ccv_array_get(symbols, k), autograd_symbol->symbol); |
1690 | 1 | ++k; |
1691 | 1 | } |
1692 | 1 | ccv_nnc_graph_exec_symbol_set_io(graph, back_exec->symbol, ccv_array_get(symbols, 0), input_size, ccv_array_get(symbols, input_size), output_size); |
1693 | 19.1k | } else if (forw_exec->flags & CCV_NNC_GRAPH_EXEC_CASE_OF) { |
1694 | 1 | ccv_array_clear(symbol_map); |
1695 | 2 | for (j = 0; j < back_exec->output_size; j++1 ) |
1696 | 1 | if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1697 | 1 | { |
1698 | 1 | ccv_nnc_tensor_symbol_map_t symbol = { |
1699 | 1 | .source = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol, |
1700 | 1 | .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol, |
1701 | 1 | }; |
1702 | 1 | ccv_array_push(symbol_map, &symbol); |
1703 | 1 | } |
1704 | 1 | const int symbol_map_size = symbol_map->rnum; |
1705 | 1 | back_exec->symbol = ccv_nnc_symbolic_graph_case_of_new(graph, back_exec->cmd.cmd, 0, 0, ccv_array_get(symbol_map, 0), symbol_map_size, forw_exec->name); |
1706 | 1 | ccv_nnc_symbolic_graph_set_case_of_expr(graph, back_exec->symbol, NOOP_GRAPH_CASE_OF_EXPR, 0); |
1707 | 4 | for (p = 0; p < forw_exec->graph_ref_size; p++3 ) |
1708 | 3 | { |
1709 | 3 | const int graph_ref = CCV_NNC_GRAPH_REF(forw_exec)[p] - 1; |
1710 | 3 | ccv_nnc_symbolic_graph_backward_prep_t* sub_prep = backward_prep->sub_preps + graph_ref; |
1711 | 3 | ccv_nnc_symbolic_graph_t* sub_graph = ccv_nnc_symbolic_graph_new(); |
1712 | 3 | sub_graph->pair = sub_prep->graph; |
1713 | 3 | if (!sub_wrt_symbols) |
1714 | 1 | sub_wrt_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1715 | | // I am done, need to redo above for sub_prep, and it has to be successful now. |
1716 | 3 | if (!sub_f_symbols) |
1717 | 1 | sub_f_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); |
1718 | 3 | _ccv_nnc_symbolic_graph_backward_prep_sub_f_wrt_symbols(forw_exec, sub_prep->graph, graph_ref, tensor_symbol_info, back_info->input_bitmasks, back_info->output_bitmasks, sub_f_symbols, sub_wrt_symbols); |
1719 | 3 | _ccv_nnc_symbolic_graph_backward_gen(sub_prep, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, 0), sub_f_symbols->rnum, (ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, 0), sub_wrt_symbols->rnum, sub_graph, root); |
1720 | 3 | ccv_array_clear(symbol_map); |
1721 | 3 | k = 0; |
1722 | 6 | for (j = 0; j < back_exec->output_size; j++3 ) |
1723 | 3 | if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1724 | 3 | { |
1725 | 3 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_wrt_symbols, k))->d; |
1726 | 3 | if (d >= 0) |
1727 | 1 | { |
1728 | 1 | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d); |
1729 | 1 | ccv_nnc_tensor_symbol_map_t symbol = { |
1730 | 1 | .source = autograd_symbol->symbol, |
1731 | 1 | .destination = ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol, |
1732 | 1 | }; |
1733 | 1 | ccv_array_push(symbol_map, &symbol); |
1734 | 2 | } else { |
1735 | | // Create a new tensor in the sub-graph and set it to be 0. |
1736 | 2 | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]); |
1737 | | // autograd_symbol->d points to the corresponding forward tensor. |
1738 | 2 | ccv_nnc_tensor_symbol_t zero_symbol = ccv_nnc_tensor_symbol_new(sub_graph, tensor_symbol_info[autograd_symbol->d].info, 0); |
1739 | 2 | ccv_nnc_graph_exec_symbol_new(sub_graph, CMD_SET_FORWARD(0), 0, 0, &zero_symbol, 1, 0); |
1740 | 2 | ccv_nnc_tensor_symbol_map_t symbol = { |
1741 | 2 | .source = zero_symbol, |
1742 | 2 | .destination = autograd_symbol->symbol, |
1743 | 2 | }; |
1744 | 2 | ccv_array_push(symbol_map, &symbol); |
1745 | 2 | } |
1746 | 3 | ++k; |
1747 | 3 | } |
1748 | 3 | ccv_nnc_graph_exec_symbol_autogen(sub_graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS); |
1749 | 3 | const int symbol_map_size = symbol_map->rnum; |
1750 | 3 | ccv_nnc_symbolic_graph_set_case_of(graph, back_exec->symbol, sub_graph, p, ccv_array_get(symbol_map, 0), symbol_map_size); |
1751 | | // Hook up inputs only after this becomes a sub graph of the graph. |
1752 | 3 | k = 0; |
1753 | 6 | for (j = 0; j < back_exec->input_size; j++3 ) |
1754 | 3 | if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1755 | 3 | { |
1756 | 3 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(sub_f_symbols, k))->d; |
1757 | 3 | assert(d >= 0); |
1758 | | // No corresponding sub tensors allocated. Skip. |
1759 | 3 | if (!sub_prep->autograd_tensor_versions[d].ref_version || |
1760 | 3 | !sub_prep->autograd_tensor_versions[d].ref_version->rnum1 ) |
1761 | 2 | continue; |
1762 | 1 | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = _ccv_nnc_autograd_tensor_symbol_from_tensor_version(sub_prep->autograd_tensor_symbols, sub_prep->autograd_tensor_versions + d); |
1763 | 1 | ccv_nnc_tensor_symbol_hookup(graph, sub_graph, ((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol, autograd_symbol->symbol); |
1764 | 1 | ++k; |
1765 | 1 | } |
1766 | | // Need to make sure tape vars are hooked up. |
1767 | 3 | _ccv_nnc_symbolic_graph_add_tape_vars(sub_prep, root, graph, sub_graph, 0); |
1768 | 3 | } |
1769 | 19.1k | } else { |
1770 | 19.1k | ccv_array_clear(symbols); |
1771 | | // Gradient inputs. |
1772 | 38.6k | for (j = 0; j < back_exec->input_size; j++19.5k ) |
1773 | 19.5k | if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1774 | 19.1k | ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol)); |
1775 | 424 | else |
1776 | 424 | ccv_array_push(symbols, &NO_TENSOR_SYMBOL); |
1777 | | // Inputs from forward function. |
1778 | 53.8k | for (j = 0; j < forw_exec->input_size; j++34.7k ) |
1779 | 34.7k | if (!(back_info->input_bitmasks[(j + back_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size)))) |
1780 | 14.4k | ccv_array_push(symbols, &NO_TENSOR_SYMBOL); |
1781 | 20.2k | else { |
1782 | 20.2k | const ccv_nnc_tensor_symbol_t symbol = { |
1783 | 20.2k | .d = forw_exec->inputs[j], |
1784 | 20.2k | .graph = backward_prep->graph |
1785 | 20.2k | }; |
1786 | 20.2k | if (graph == backward_prep->graph) |
1787 | 20.2k | ccv_array_push(symbols, &symbol); |
1788 | 5 | else { // Otherwise, create a new symbol, and set its pair to the old symbol. |
1789 | 5 | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->inputs[j]].info, tensor_symbol_info[forw_exec->inputs[j]].name); |
1790 | 5 | ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol); |
1791 | 5 | const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR; |
1792 | 5 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
1793 | 5 | ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags); |
1794 | 5 | ccv_array_push(symbols, &new_symbol); |
1795 | 5 | } |
1796 | 20.2k | } |
1797 | | // Outputs from forward function. |
1798 | 38.6k | for (j = 0; j < forw_exec->output_size; j++19.5k ) |
1799 | 19.5k | if (!(back_info->input_bitmasks[(j + back_exec->input_size + forw_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size + forw_exec->input_size)))) |
1800 | 14.3k | ccv_array_push(symbols, &NO_TENSOR_SYMBOL); |
1801 | 5.14k | else { |
1802 | 5.14k | const ccv_nnc_tensor_symbol_t symbol = { |
1803 | 5.14k | .d = forw_exec->outputs[j], |
1804 | 5.14k | .graph = backward_prep->graph |
1805 | 5.14k | }; |
1806 | 5.14k | if (graph == backward_prep->graph) |
1807 | 5.14k | ccv_array_push(symbols, &symbol); |
1808 | 2 | else { // Otherwise, create a new symbol, and set its pair to the old symbol. |
1809 | 2 | const ccv_nnc_tensor_symbol_t new_symbol = ccv_nnc_tensor_symbol_new(graph, tensor_symbol_info[forw_exec->outputs[j]].info, tensor_symbol_info[forw_exec->outputs[j]].name); |
1810 | 2 | ccv_nnc_tensor_symbol_pair_with(graph, new_symbol, symbol); |
1811 | 2 | const int flags = ccv_nnc_tensor_symbol_flags(backward_prep->graph, symbol) | CCV_NNC_TENSOR_SYMBOL_TAPE_VAR; |
1812 | 2 | ccv_nnc_tensor_symbol_set_flags(graph, new_symbol, flags); |
1813 | 2 | ccv_nnc_tensor_symbol_set_flags(backward_prep->graph, symbol, flags); |
1814 | 2 | ccv_array_push(symbols, &new_symbol); |
1815 | 2 | } |
1816 | 5.14k | } |
1817 | 53.8k | for (j = 0; j < back_exec->output_size; j++34.7k ) |
1818 | 34.7k | if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j)) |
1819 | 26.1k | ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol)); |
1820 | 8.59k | else |
1821 | 8.59k | ccv_array_push(symbols, &NO_TENSOR_SYMBOL); |
1822 | 19.1k | back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, ccv_array_get(symbols, 0), back_exec->input_size + forw_exec->input_size + forw_exec->output_size, ccv_array_get(symbols, back_exec->input_size + forw_exec->input_size + forw_exec->output_size), back_exec->output_size, 0); |
1823 | 19.1k | ccv_nnc_graph_exec_symbol_set_hint(graph, back_exec->symbol, exec_symbol_info[i].hint); |
1824 | 19.1k | ccv_nnc_graph_exec_symbol_pair_with(graph, back_exec->symbol, (ccv_nnc_graph_exec_symbol_t){ |
1825 | 19.1k | .d = i, |
1826 | 19.1k | .graph = backward_prep->graph, |
1827 | 19.1k | }); |
1828 | 19.1k | } |
1829 | 19.1k | } |
1830 | 6.78k | if (sub_f_symbols) |
1831 | 2 | ccv_array_free(sub_f_symbols); |
1832 | 6.78k | if (sub_wrt_symbols) |
1833 | 2 | ccv_array_free(sub_wrt_symbols); |
1834 | 6.78k | if (sub_execs) |
1835 | 1 | ccv_array_free(sub_execs); |
1836 | 6.78k | ccv_array_t* const sum_or_set_execs = backward_prep->sum_or_set_execs; |
1837 | 11.0k | for (i = 0; i < sum_or_set_execs->rnum; i++4.28k ) |
1838 | 4.28k | { |
1839 | 4.28k | ccv_nnc_sum_or_set_graph_exec_symbol_t* sum_or_set_exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i); |
1840 | | // If it has inputs, it is a sum; a set op doesn't have inputs.
1841 | 4.28k | if (sum_or_set_exec->input_size) |
1842 | 4.28k | { |
1843 | 4.28k | ccv_array_clear(symbols); |
1844 | | // This is the sum case: collect the inputs to sum.
1845 | 12.8k | for (j = 0; j < sum_or_set_exec->input_size; j++8.59k ) |
1846 | 8.59k | ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->inputs[j]))->symbol)); |
1847 | 4.28k | ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0); |
1848 | 4.28k | sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, ccv_array_get(symbols, 0), sum_or_set_exec->input_size, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0); |
1849 | 4.28k | } else |
1850 | 1 | sum_or_set_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, CMD_SET_FORWARD(sum_or_set_exec->value), 0, 0, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, sum_or_set_exec->output))->symbol), 1, 0); |
1851 | 4.28k | } |
1852 | 6.78k | ccv_array_free(symbol_map); |
1853 | 6.78k | ccv_array_free(symbols); |
1854 | 25.9k | for (i = 0; i < exec_symbol_info_size; i++19.1k ) |
1855 | 19.1k | { |
1856 | | // This is not going to be an interesting node. Skip. |
1857 | 19.1k | if ((backward_info[i].f_wrt & 0x3) != 0x3) |
1858 | 86 | continue; |
1859 | 19.1k | ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i; |
1860 | | // If on the same graph, we cannot decide whether it comes before or after the forw_exec, so enforce that it comes after forw_exec.
1861 | 19.1k | if (graph == backward_prep->graph) |
1862 | 19.1k | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ |
1863 | 19.1k | .d = i, |
1864 | 19.1k | .graph = graph |
1865 | 19.1k | }, back_exec->symbol); |
1866 | 19.1k | if (back_exec->outgoings) |
1867 | 24.7k | for (j = 0; 12.3k j < back_exec->outgoings->rnum; j++12.4k ) |
1868 | 12.4k | { |
1869 | 12.4k | int d = *(int*)ccv_array_get(back_exec->outgoings, j); |
1870 | 12.4k | if (d < exec_symbol_info_size) |
1871 | 8.08k | ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, autograd_execs[d].symbol); |
1872 | 4.36k | else |
1873 | 4.36k | ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol); |
1874 | 12.4k | } |
1875 | 19.1k | } |
1876 | 11.0k | for (i = 0; i < sum_or_set_execs->rnum; i++4.28k ) |
1877 | 4.28k | { |
1878 | 4.28k | ccv_nnc_sum_or_set_graph_exec_symbol_t* exec = (ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, i); |
1879 | 4.28k | if (exec->outgoings) |
1880 | 8.50k | for (j = 0; 4.25k j < exec->outgoings->rnum; j++4.25k ) |
1881 | 4.25k | { |
1882 | 4.25k | int d = *(int*)ccv_array_get(exec->outgoings, j); |
1883 | 4.25k | if (d < exec_symbol_info_size) |
1884 | 4.25k | ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, autograd_execs[d].symbol); |
1885 | 0 | else |
1886 | 0 | ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, d - exec_symbol_info_size))->symbol); |
1887 | 4.25k | } |
1888 | 4.28k | } |
1889 | | // Now that everything is done, set the metadata on the graph so that we can look up the backward symbols later.
1890 | 6.78k | if (graph->backward.tensor_symbol_idx) |
1891 | 4.40k | graph->backward.tensor_symbol_idx = (int*)ccrealloc(graph->backward.tensor_symbol_idx, sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size)); |
1892 | 2.37k | else |
1893 | 2.37k | graph->backward.tensor_symbol_idx = (int*)ccmalloc(sizeof(int) * (graph->tensor_symbol_info->rnum + tensor_symbol_info_size)); |
1894 | 6.78k | graph->backward.tensor_symbol_size = tensor_symbol_info_size; |
1895 | 6.78k | graph->backward.exec_symbol_idx = graph->backward.tensor_symbol_idx + tensor_symbol_info_size; |
1896 | 6.78k | graph->backward.exec_symbol_size = graph->tensor_symbol_info->rnum; |
1897 | 46.4k | for (i = 0; i < tensor_symbol_info_size; i++39.6k ) |
1898 | 39.6k | graph->backward.tensor_symbol_idx[i] = -1; |
1899 | 85.8k | for (i = 0; i < graph->backward.exec_symbol_size; i++79.1k ) |
1900 | 79.1k | graph->backward.exec_symbol_idx[i] = -1; |
1901 | 6.78k | ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep->autograd_tensor_versions; |
1902 | | // Assign the backward metadata for the wrt symbols.
1903 | 16.3k | for (i = 0; i < wrt_symbol_size; i++9.53k ) |
1904 | 9.53k | { |
1905 | 9.53k | const int d = wrt_symbols[i].d; |
1906 | 9.53k | if (d < 0) |
1907 | 9 | continue; |
1908 | 9.52k | assert(d < tensor_symbol_info_size); |
1909 | 9.52k | const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + d; |
1910 | 9.52k | ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + ((!forw_symbol->alias_ref) ? d9.52k : forw_symbol->alias_ref - 11 ); |
1911 | 9.52k | assert(tensor_ver->ref_version); |
1912 | 9.52k | ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c); |
1913 | 9.52k | ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
1914 | | // If this wrt symbol is an alias, create an extra alias for it.
1915 | 9.52k | if (!forw_symbol->alias_ref) |
1916 | 9.52k | graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d; |
1917 | 1 | else // We create a new alias, and it cannot be referenced from exec_symbol_idx because that table's size is limited to the previous tensor symbol count.
1918 | 1 | graph->backward.tensor_symbol_idx[d] = ccv_nnc_tensor_symbol_alias_new(graph, autograd_symbol->symbol, forw_symbol->ofs, forw_symbol->stride, forw_symbol->info, 0).d; |
1919 | 9.52k | const int dd = autograd_symbol->symbol.d; |
1920 | 9.52k | const int x = tensor_ref->x; |
1921 | 9.52k | if (tensor_ref->exec_registry && tensor_ref->exec_registry->rnum2 ) // Create no-op node. |
1922 | 2 | { |
1923 | 2 | ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0); |
1924 | 2 | if (x < exec_symbol_info_size) |
1925 | 2 | ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop); |
1926 | 0 | else |
1927 | 0 | ccv_nnc_graph_exec_symbol_concat(graph, ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol, noop); |
1928 | 6 | for (j = 0; j < tensor_ref->exec_registry->rnum; j++4 ) |
1929 | 4 | { |
1930 | 4 | const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j); |
1931 | 4 | assert(x >= 0); /* Otherwise, this is an initialization tensor, which cannot participate in a sum. */
1932 | 4 | assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry; it simply cannot reference a sum operation.
1933 | 4 | ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop); |
1934 | 4 | } |
1935 | 2 | graph->backward.exec_symbol_idx[dd] = noop.d; |
1936 | 9.52k | } else { |
1937 | 9.52k | if (x < exec_symbol_info_size) |
1938 | 9.49k | graph->backward.exec_symbol_idx[dd] = autograd_execs[x].symbol.d; |
1939 | 33 | else |
1940 | 33 | graph->backward.exec_symbol_idx[dd] = ((ccv_nnc_sum_or_set_graph_exec_symbol_t*)ccv_array_get(sum_or_set_execs, x - exec_symbol_info_size))->symbol.d; |
1941 | 9.52k | } |
1942 | 9.52k | } |
1943 | | // Assign the backward metadata for the f symbols.
1944 | 13.5k | for (i = 0; 6.78k i < f_symbol_size; i++6.79k ) |
1945 | 6.79k | { |
1946 | 6.79k | const int d = f_symbols[i].d; |
1947 | 6.79k | assert(d >= 0); |
1948 | 6.79k | assert(d < tensor_symbol_info_size); |
1949 | 6.79k | const ccv_nnc_autograd_tensor_version_t* const tensor_ver = autograd_tensor_versions + d; |
1950 | 6.79k | if (tensor_ver->ref_version) |
1951 | 6.79k | { |
1952 | | // We don't use _ccv_nnc_autograd_tensor_symbol_from_tensor_version because that selects the last version; here we need the first version.
1953 | 6.79k | const ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, 0); |
1954 | 6.79k | const ccv_nnc_autograd_tensor_symbol_t* const autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d); |
1955 | 6.79k | graph->backward.tensor_symbol_idx[d] = autograd_symbol->symbol.d; |
1956 | | // We cannot record the relevant backward exec symbol for f; there could be many.
1957 | 6.79k | } |
1958 | 6.79k | } |
1959 | 6.78k | } |
1960 | | |
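A note on the bitmask convention used throughout the generator above: input_bitmasks and output_bitmasks pack one bit per input or output slot into 64-bit words, so slot j lives in word j >> 6. A minimal, self-contained sketch of that convention follows; the helper names are illustrative and not part of the library, and the & 63 mask here is defensive, whereas the generator simply keeps j below the allocated bitmask size.

#include <stdint.h>

/* Illustrative helpers only: slot j maps to word j >> 6, bit j & 63. */
static inline void slot_bitmask_set(uint64_t* const masks, const int j)
{
	masks[j >> 6] |= ((uint64_t)1 << (j & 63));
}

static inline int slot_bitmask_isset(const uint64_t* const masks, const int j)
{
	return !!(masks[j >> 6] & ((uint64_t)1 << (j & 63)));
}
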
1961 | | void ccv_nnc_symbolic_graph_backward(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size) |
1962 | 6.77k | { |
1963 | 6.77k | int i; |
1964 | | // f symbols cannot be aliases.
1965 | 13.5k | for (i = 0; i < f_symbol_size; i++6.79k ) |
1966 | 6.79k | if (f_symbols[i].d >= 0) |
1967 | 6.79k | { |
1968 | 6.79k | assert(f_symbols[i].graph == graph); // f symbol has to be in the current graph. |
1969 | 6.79k | assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, f_symbols[i].d))->alias_ref); |
1970 | 6.79k | } |
1971 | 16.3k | for (i = 0; 6.77k i < wrt_symbol_size; i++9.53k ) |
1972 | 9.53k | if (wrt_symbols[i].d >= 0) |
1973 | 9.52k | { |
1974 | 9.52k | assert(wrt_symbols[i].graph == graph); |
1975 | | // Either this is not an alias, or what it refers to is not itself an alias (no alias of an alias).
1976 | 9.52k | assert(!((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref || !((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, ((ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, wrt_symbols[i].d))->alias_ref - 1))->alias_ref); |
1977 | 9.52k | } |
1978 | 6.77k | const int exec_symbol_info_size = graph->exec_symbol_info->rnum; |
1979 | 6.77k | const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum; |
1980 | 6.77k | assert(exec_symbol_info_size > 0); |
1981 | 6.77k | assert(tensor_symbol_info_size > 0); |
1982 | 6.77k | ccv_nnc_symbolic_graph_backward_prep_t backward_prep = _ccv_nnc_symbolic_graph_backward_prep(graph, sources, source_size, destinations, destination_size); |
1983 | 6.77k | _ccv_nnc_symbolic_graph_backward_prep_prune_ops(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, sources, source_size, destinations, destination_size); |
1984 | 6.77k | _ccv_nnc_symbolic_graph_backward_prep_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, 0, sources, source_size, destinations, destination_size); |
1985 | 6.77k | _ccv_nnc_symbolic_graph_backward_gen(&backward_prep, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, graph, graph); |
1986 | 6.77k | _ccv_nnc_symbolic_graph_backward_prep_free(backward_prep); |
1987 | 6.77k | } |
1988 | | |
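For orientation, a minimal sketch of how this entry point is typically driven, assuming the convenience macros from ccv_nnc_easy.h (CPU_TENSOR_NHWC, TENSOR_SYMBOL_LIST, SYMBOLIC_GRAPH_SOURCES, SYMBOLIC_GRAPH_DESTINATIONS) behave as their names suggest; the placeholder forward command and tensor shapes are illustrative, not taken from the library's tests.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"

static void backward_usage_sketch(void)
{
	ccv_nnc_symbolic_graph_t* const graph = ccv_nnc_symbolic_graph_new();
	const ccv_nnc_tensor_symbol_t x = ccv_nnc_tensor_symbol_new(graph, CPU_TENSOR_NHWC(32F, 10), "x");
	const ccv_nnc_tensor_symbol_t y = ccv_nnc_tensor_symbol_new(graph, CPU_TENSOR_NHWC(32F, 10), "y");
	// Any differentiable forward command works here; EWSUM is a stand-in.
	ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0), TENSOR_SYMBOL_LIST(x), TENSOR_SYMBOL_LIST(y), "forward");
	ccv_nnc_graph_exec_symbol_autogen(graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
	// Append the backward pass for dy/dx onto the same graph.
	ccv_nnc_symbolic_graph_backward(graph, TENSOR_SYMBOL_LIST(y), TENSOR_SYMBOL_LIST(x), SYMBOLIC_GRAPH_SOURCES(graph), SYMBOLIC_GRAPH_DESTINATIONS(graph));
	// Look up the gradient symbol and the exec symbol that finalizes it.
	const ccv_nnc_tensor_symbol_t dx = ccv_nnc_tensor_symbol_for_backward(graph, x);
	const ccv_nnc_graph_exec_symbol_t dx_exec = ccv_nnc_graph_exec_symbol_for_backward(graph, dx);
	(void)dx_exec;
	ccv_nnc_symbolic_graph_free(graph);
}
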
1989 | | ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol) |
1990 | 27.6k | { |
1991 | 27.6k | assert(symbol.d >= 0); |
1992 | 27.6k | assert(symbol.d < graph->backward.tensor_symbol_size); |
1993 | 27.6k | if (graph->backward.tensor_symbol_idx[symbol.d] < 0) |
1994 | 10 | return NO_TENSOR_SYMBOL; |
1995 | 27.6k | ccv_nnc_tensor_symbol_t tensor = { |
1996 | 27.6k | .d = graph->backward.tensor_symbol_idx[symbol.d], |
1997 | 27.6k | .graph = graph, |
1998 | 27.6k | }; |
1999 | 27.6k | return tensor; |
2000 | 27.6k | } |
2001 | | |
2002 | | ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol) |
2003 | 17.7k | { |
2004 | 17.7k | assert(symbol.d >= 0); |
2005 | 17.7k | assert(symbol.d < graph->tensor_symbol_info->rnum); |
2006 | 17.7k | int dd = symbol.d; |
2007 | | // Check if this is an alias. Use the original if it is. |
2008 | 17.7k | ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, dd); |
2009 | 17.7k | if (symbol_info->alias_ref) |
2010 | 2 | dd = symbol_info->alias_ref - 1; |
2011 | 17.7k | assert(dd >= 0); |
2012 | 17.7k | assert(dd < graph->backward.exec_symbol_size); |
2013 | 17.7k | if (graph->backward.exec_symbol_idx[dd] < 0) |
2014 | 0 | return (ccv_nnc_graph_exec_symbol_t){ |
2015 | 0 | .graph = 0, |
2016 | 0 | .d = CCV_NNC_NO_GRAPH_EXEC_SYMBOL |
2017 | 0 | }; |
2018 | 17.7k | ccv_nnc_graph_exec_symbol_t exec = { |
2019 | 17.7k | .d = graph->backward.exec_symbol_idx[dd], |
2020 | 17.7k | .graph = graph |
2021 | 17.7k | }; |
2022 | 17.7k | return exec; |
2023 | 17.7k | } |
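
The two lookups above make it possible to order follow-up work (an optimizer update, a copy to the host, etc.) after a particular gradient has been finalized. A minimal sketch, assuming a graph whose backward pass was already generated; 'w' (a forward wrt symbol) and 'update' (an exec symbol consuming the gradient) are hypothetical names supplied by the caller.

#include "ccv_nnc.h"

// Wire 'update' to run only after the gradient of 'w' has been finalized by the
// generated backward pass (the finalizing exec may be a producing node or the
// no-op gather created for aliases in the generator above).
static void order_update_after_gradient(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t w, const ccv_nnc_graph_exec_symbol_t update)
{
	const ccv_nnc_tensor_symbol_t dw = ccv_nnc_tensor_symbol_for_backward(graph, w);
	if (dw.d < 0) // No gradient was generated for 'w'.
		return;
	const ccv_nnc_graph_exec_symbol_t dw_exec = ccv_nnc_graph_exec_symbol_for_backward(graph, dw);
	if (dw_exec.d == CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
		return;
	ccv_nnc_graph_exec_symbol_concat(graph, dw_exec, update);
}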