File: | nnc/ccv_cnnp_model_gradient_checkpointing.c |
Warning: | line 413, column 26 Array access (via field 'vals') results in a null pointer dereference |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | #include "ccv_nnc.h" | ||||
2 | #include "ccv_nnc_easy.h" | ||||
3 | #include "ccv_nnc_internal.h" | ||||
4 | #include "ccv_internal.h" | ||||
5 | #include "_ccv_cnnp_model.h" | ||||
6 | // This can be removed once we organized ccv_cnnp_apply_gradient_checkpoints better. | ||||
7 | #include "_ccv_nnc_symbolic_graph.h" | ||||
8 | |||||
9 | typedef struct { | ||||
10 | ccv_array_t* outgoings; | ||||
11 | } ccv_nnc_graph_exec_symbol_reverse_t; | ||||
12 | |||||
13 | typedef struct { | ||||
14 | ccv_array_t* tensor_symbols; | ||||
15 | void* old_tensor_symbol_new_hook_context; | ||||
16 | ccv_nnc_tensor_symbol_new_hook_f old_tensor_symbol_new_hook; | ||||
17 | void* old_tensor_symbol_alias_new_hook_context; | ||||
18 | ccv_nnc_tensor_symbol_alias_new_hook_f old_tensor_symbol_alias_new_hook; | ||||
19 | ccv_array_t* graph_exec_symbols; | ||||
20 | ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook; | ||||
21 | void* old_graph_exec_symbol_new_hook_context; | ||||
22 | } ccv_cnnp_gradient_checkpoint_build_t; | ||||
23 | |||||
24 | static void _ccv_cnnp_gradient_checkpoint_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name) | ||||
25 | { | ||||
26 | ccv_cnnp_gradient_checkpoint_build_t* const build_context = (ccv_cnnp_gradient_checkpoint_build_t*)context; | ||||
27 | ccv_array_push(build_context->tensor_symbols, &symbol); | ||||
28 | if (build_context->old_tensor_symbol_new_hook) | ||||
29 | build_context->old_tensor_symbol_new_hook(build_context->old_tensor_symbol_new_hook_context, symbol, info, name); | ||||
30 | } | ||||
31 | |||||
32 | static void _ccv_cnnp_gradient_checkpoint_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC(12)], const int inc[CCV_NNC_MAX_DIM_ALLOC(12)], const ccv_nnc_tensor_param_t info, const char* const name) | ||||
33 | { | ||||
34 | ccv_cnnp_gradient_checkpoint_build_t* const build_context = (ccv_cnnp_gradient_checkpoint_build_t*)context; | ||||
35 | ccv_array_push(build_context->tensor_symbols, &symbol); | ||||
36 | if (build_context->old_tensor_symbol_alias_new_hook) | ||||
37 | build_context->old_tensor_symbol_alias_new_hook(build_context->old_tensor_symbol_alias_new_hook_context, symbol, from_symbol, ofs, inc, info, name); | ||||
38 | } | ||||
39 | |||||
40 | static void _ccv_cnnp_model_gradient_checkpoint_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) | ||||
41 | { | ||||
42 | ccv_cnnp_gradient_checkpoint_build_t* const build = (ccv_cnnp_gradient_checkpoint_build_t*)context; | ||||
43 | ccv_array_push(build->graph_exec_symbols, &symbol); | ||||
44 | if (build->old_graph_exec_symbol_new_hook) | ||||
45 | build->old_graph_exec_symbol_new_hook(build->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name); | ||||
46 | } | ||||
47 | |||||
48 | KHASH_MAP_INIT_INT(ccv_cnnp_tensor_symbol_map, int)typedef struct kh_ccv_cnnp_tensor_symbol_map_s { khint_t n_buckets , size, n_occupied, upper_bound; khint32_t *flags; khint32_t * keys; int *vals; } kh_ccv_cnnp_tensor_symbol_map_t; static inline __attribute__ ((__unused__)) kh_ccv_cnnp_tensor_symbol_map_t *kh_init_ccv_cnnp_tensor_symbol_map(void) { return (kh_ccv_cnnp_tensor_symbol_map_t *)calloc(1,sizeof(kh_ccv_cnnp_tensor_symbol_map_t)); } static inline __attribute__ ((__unused__)) void kh_destroy_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h) { if (h) { free((void *) h->keys); free(h->flags); free((void *)h->vals); free (h); } } static inline __attribute__ ((__unused__)) void kh_clear_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h) { if (h && h-> flags) { memset(h->flags, 0xaa, ((h->n_buckets) < 16 ? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h-> size = h->n_occupied = 0; } } static inline __attribute__ ( (__unused__)) khint_t kh_get_ccv_cnnp_tensor_symbol_map(const kh_ccv_cnnp_tensor_symbol_map_t *h, khint32_t key) { if (h-> n_buckets) { khint_t k, i, last, mask, step = 0; mask = h-> n_buckets - 1; k = (khint32_t)(key); i = k & mask; last = i; while (!((h->flags[i>>4]>>((i&0xfU)<< 1))&2) && (((h->flags[i>>4]>>((i& 0xfU)<<1))&1) || !((h->keys[i]) == (key)))) { i = (i + (++step)) & mask; if (i == last) return h->n_buckets ; } return ((h->flags[i>>4]>>((i&0xfU)<< 1))&3)? h->n_buckets : i; } else return 0; } static inline __attribute__ ((__unused__)) int kh_resize_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h, khint_t new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (--(new_n_buckets ), (new_n_buckets)|=(new_n_buckets)>>1, (new_n_buckets) |=(new_n_buckets)>>2, (new_n_buckets)|=(new_n_buckets)>> 4, (new_n_buckets)|=(new_n_buckets)>>8, (new_n_buckets) |=(new_n_buckets)>>16, ++(new_n_buckets)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; else { new_flags = (khint32_t *)malloc(((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (!new_flags) return -1; memset(new_flags , 0xaa, ((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (h->n_buckets < new_n_buckets ) { khint32_t *new_keys = (khint32_t*)realloc((void *)h->keys ,new_n_buckets * sizeof(khint32_t)); if (!new_keys) { free(new_flags ); return -1; } h->keys = new_keys; if (1) { int *new_vals = (int*)realloc((void *)h->vals,new_n_buckets * sizeof(int )); if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals; } } } } if (j) { for (j = 0; j != h->n_buckets ; ++j) { if (((h->flags[j>>4]>>((j&0xfU)<< 1))&3) == 0) { khint32_t key = h->keys[j]; int val; khint_t new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals [j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<< 1)); while (1) { khint_t k, i, step = 0; k = (khint32_t)(key) ; i = k & new_mask; while (!((new_flags[i>>4]>> ((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask ; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<< 1))); if (i < h->n_buckets && ((h->flags[i>> 4]>>((i&0xfU)<<1))&3) == 0) { { khint32_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1 ) { int tmp = h->vals[i]; h->vals[i] = val; val = tmp; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<1) ); } else { h->keys[i] = key; if (1) h->vals[i] = val; break ; } } } } if (h->n_buckets > new_n_buckets) { h->keys = (khint32_t*)realloc((void *)h->keys,new_n_buckets * sizeof (khint32_t)); if (1) h->vals = (int*)realloc((void *)h-> vals,new_n_buckets * sizeof(int)); } free(h->flags); h-> flags = new_flags; h->n_buckets = new_n_buckets; h->n_occupied = h->size; h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static inline __attribute__ ((__unused__)) khint_t kh_put_ccv_cnnp_tensor_symbol_map(kh_ccv_cnnp_tensor_symbol_map_t *h, khint32_t key, int *ret) { khint_t x; if (h->n_occupied >= h->upper_bound) { if (h->n_buckets > (h->size <<1)) { if (kh_resize_ccv_cnnp_tensor_symbol_map(h, h-> n_buckets - 1) < 0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_ccv_cnnp_tensor_symbol_map(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets; } } { khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; x = site = h->n_buckets; k = (khint32_t)(key); i = k & mask; if (((h->flags[i>>4]>>((i&0xfU)<<1))& 2)) x = i; else { last = i; while (!((h->flags[i>>4] >>((i&0xfU)<<1))&2) && (((h->flags [i>>4]>>((i&0xfU)<<1))&1) || !((h-> keys[i]) == (key)))) { if (((h->flags[i>>4]>>( (i&0xfU)<<1))&1)) site = i; i = (i + (++step)) & mask; if (i == last) { x = site; break; } } if (x == h->n_buckets ) { if (((h->flags[i>>4]>>((i&0xfU)<< 1))&2) && site != h->n_buckets) x = site; else x = i; } } } if (((h->flags[x>>4]>>((x&0xfU )<<1))&2)) { h->keys[x] = key; (h->flags[x>> 4]&=~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->n_occupied; *ret = 1; } else if (((h->flags[x>> 4]>>((x&0xfU)<<1))&1)) { h->keys[x] = key ; (h->flags[x>>4]&=~(3ul<<((x&0xfU)<< 1))); ++h->size; *ret = 2; } else *ret = 0; return x; } static inline __attribute__ ((__unused__)) void kh_del_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h, khint_t x) { if (x != h-> n_buckets && !((h->flags[x>>4]>>((x& 0xfU)<<1))&3)) { (h->flags[x>>4]|=1ul<< ((x&0xfU)<<1)); --h->size; } } | ||||
49 | |||||
50 | void ccv_cnnp_model_apply_gradient_checkpoints(ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_symbolic_graph_t* const graph) | ||||
51 | { | ||||
52 | ccv_array_t* const gradient_checkpoints = compiled_data->gradient_checkpoints; | ||||
53 | if (!gradient_checkpoints || gradient_checkpoints->rnum == 0) // No saved gradient checkpoints, this is an easy way out. | ||||
| |||||
54 | return; | ||||
55 | // Otherwise, for each gradient checkpoint, there are 3 steps: | ||||
56 | // 1. Find currently, what execs exists from inputs to outputs. | ||||
57 | // 2. Find execs that generates the outputs, and their corresponding backward execs. | ||||
58 | // 3. Find all backward execs flow from outputs back to inputs. | ||||
59 | // 4. Generate new ops by calling build again with old inputs, record all new tensors / execs. | ||||
60 | // 5. Replace inputs in backward execs with the new tensors. | ||||
61 | // 6. Hook the execs takes inputs with edge from parents of backward execs in step 2. | ||||
62 | // 7. Delete newly generated execs that has no use (i.e. its outputs are not used by backward pass). | ||||
63 | // 8. Mark all new execs with DISABLE_OPT to avoid common sub-expression elimination pass. | ||||
64 | int i, j, k, l; | ||||
65 | ccv_array_t* input_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | ||||
66 | ccv_array_t* output_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | ||||
67 | ccv_array_t* input_gradient_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | ||||
68 | ccv_array_t* output_gradient_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | ||||
69 | ccv_array_t* visited_backward_execs = ccv_array_new(sizeof(int), 0, 0); | ||||
70 | ccv_array_t* replaced_backward_execs = ccv_array_new(sizeof(int), 0, 0); | ||||
71 | const int exec_rnum = graph->exec_symbol_info->rnum; | ||||
72 | ccv_nnc_graph_exec_symbol_reverse_t* const reversed_nodes = cccalloccalloc(exec_rnum, sizeof(ccv_nnc_graph_exec_symbol_reverse_t)); | ||||
73 | for (i = 0; i < exec_rnum; i++) | ||||
74 | { | ||||
75 | const int* tos = 0; | ||||
76 | int to_size = 0; | ||||
77 | ccv_nnc_graph_exec_symbol_to(graph, (ccv_nnc_graph_exec_symbol_t){ | ||||
78 | .graph = graph, | ||||
79 | .d = i | ||||
80 | }, &tos, &to_size); | ||||
81 | if (tos) | ||||
82 | for (j = 0; j < to_size; j++) | ||||
83 | { | ||||
84 | if (!reversed_nodes[tos[j]].outgoings) | ||||
85 | reversed_nodes[tos[j]].outgoings = ccv_array_new(sizeof(int), 1, 0); | ||||
86 | ccv_array_add_unique_int(reversed_nodes[tos[j]].outgoings, i); | ||||
87 | } | ||||
88 | } | ||||
89 | uint32_t* const maskbit = cccalloccalloc((exec_rnum + 31) >> 5, sizeof(uint32_t)); | ||||
90 | // Temporary for build_data. | ||||
91 | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); | ||||
92 | ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0); | ||||
93 | ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0); | ||||
94 | ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); | ||||
95 | ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0); | ||||
96 | ccv_array_t* const buf = ccv_array_new(sizeof(int), 0, 0); | ||||
97 | int max_output_size = 0; | ||||
98 | for (i = 0; i < gradient_checkpoints->rnum; i++) | ||||
99 | { | ||||
100 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(gradient_checkpoints, i)((void*)(((char*)((gradient_checkpoints)->data)) + (size_t )(gradient_checkpoints)->rsize * (size_t)(i))); | ||||
101 | max_output_size = ccv_max(checkpoint->output_size, max_output_size)({ typeof (checkpoint->output_size) _a = (checkpoint->output_size ); typeof (max_output_size) _b = (max_output_size); (_a > _b ) ? _a : _b; }); | ||||
102 | } | ||||
103 | ccv_nnc_tensor_symbol_t* max_outputs = ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_t) * max_output_size); | ||||
104 | ccv_array_t* newly_used_outputs = ccv_array_new(sizeof(int), 0, 0); | ||||
105 | for (i = 0; i < gradient_checkpoints->rnum; i++) | ||||
106 | { | ||||
107 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(gradient_checkpoints, i)((void*)(((char*)((gradient_checkpoints)->data)) + (size_t )(gradient_checkpoints)->rsize * (size_t)(i))); | ||||
108 | ccv_array_clear(input_execs); | ||||
109 | ccv_array_clear(output_execs); | ||||
110 | ccv_nnc_graph_exec_symbol_info_t* exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | ||||
111 | for (j = 0; j
| ||||
112 | { | ||||
113 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[j].flags)((exec_info[j].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
114 | continue; | ||||
115 | const int* inputs = exec_info[j].inputs; | ||||
116 | int input_size = exec_info[j].input_size; | ||||
117 | const int* outputs = exec_info[j].outputs; | ||||
118 | int output_size = exec_info[j].output_size; | ||||
119 | if (input_size == 0 && output_size == 0) | ||||
120 | continue; | ||||
121 | // Only go through forward pass. | ||||
122 | if (ccv_nnc_cmd_is_backward(exec_info[j].cmd)) | ||||
123 | continue; | ||||
124 | const ccv_nnc_graph_exec_symbol_t symbol = { | ||||
125 | .graph = graph, | ||||
126 | .d = j | ||||
127 | }; | ||||
128 | int flag = 0; | ||||
129 | for (k = 0; inputs && k < input_size && !flag; k++) | ||||
130 | if (inputs[k] >= 0) | ||||
131 | for (l = 0; l < checkpoint->input_size && !flag; l++) | ||||
132 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | ||||
133 | flag = 1; | ||||
134 | if (flag) | ||||
135 | ccv_array_push(input_execs, &symbol); | ||||
136 | flag = 0; | ||||
137 | for (k = 0; outputs && k < output_size && !flag; k++) | ||||
138 | if (outputs[k] >= 0) | ||||
139 | for (l = 0; l < checkpoint->output_size && !flag; l++) | ||||
140 | if (checkpoint->outputs[l].d >= 0 && outputs[k] == checkpoint->outputs[l].d) | ||||
141 | flag = 1; | ||||
142 | if (flag) | ||||
143 | ccv_array_push(output_execs, &symbol); | ||||
144 | } | ||||
145 | if (input_execs->rnum <= 0 || output_execs->rnum <= 0) | ||||
146 | continue; | ||||
147 | // Fill in blanks (i.e. the backward ops that are not showing in above, but should be included to avoid excluding necessary ones). This is done by flowing gradients from outputs back all the way to inputs. | ||||
148 | ccv_array_clear(input_gradient_execs); | ||||
149 | ccv_array_clear(output_gradient_execs); | ||||
150 | for (j = 0; j
| ||||
151 | { | ||||
152 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_execs, j)((void*)(((char*)((input_execs)->data)) + (size_t)(input_execs )->rsize * (size_t)(j))))->d; | ||||
153 | for (k = 0; k < exec_info[d].input_size; k++) | ||||
154 | if (exec_info[d].inputs[k] >= 0) | ||||
155 | { | ||||
156 | const ccv_nnc_tensor_symbol_t gradient_symbol = ccv_nnc_tensor_symbol_for_backward(graph, (ccv_nnc_tensor_symbol_t){ | ||||
157 | .graph = graph, | ||||
158 | .d = exec_info[d].inputs[k] | ||||
159 | }); | ||||
160 | if (gradient_symbol.d < 0) | ||||
161 | continue; | ||||
162 | const ccv_nnc_graph_exec_symbol_t backward = ccv_nnc_graph_exec_symbol_for_backward(graph, gradient_symbol); | ||||
163 | if (backward.d < 0) | ||||
164 | continue; | ||||
165 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[backward.d].flags)((exec_info[backward.d].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
166 | continue; | ||||
167 | int flag = 0; | ||||
168 | for (l = 0; !flag && l < output_gradient_execs->rnum; l++) | ||||
169 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, l)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(l))))->d == backward.d) | ||||
170 | flag = 1; | ||||
171 | if (!flag) | ||||
172 | ccv_array_push(output_gradient_execs, &backward); | ||||
173 | } | ||||
174 | if (exec_info[d].outgoings && exec_info[d].outgoings->rnum > 0) | ||||
175 | for (k = 0; k < exec_info[d].outgoings->rnum; k++) | ||||
176 | { | ||||
177 | const int to_d = *(int*)ccv_array_get(exec_info[d].outgoings, k)((void*)(((char*)((exec_info[d].outgoings)->data)) + (size_t )(exec_info[d].outgoings)->rsize * (size_t)(k))); | ||||
178 | if (!ccv_nnc_cmd_is_backward(exec_info[to_d].cmd)) | ||||
179 | continue; | ||||
180 | int flag = 0; | ||||
181 | for (l = 0; !flag && l < output_gradient_execs->rnum; l++) | ||||
182 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, l)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(l))))->d == to_d) | ||||
183 | flag = 1; | ||||
184 | if (!flag) | ||||
185 | { | ||||
186 | const ccv_nnc_graph_exec_symbol_t backward = { | ||||
187 | .graph = graph, | ||||
188 | .d = to_d | ||||
189 | }; | ||||
190 | ccv_array_push(output_gradient_execs, &backward); | ||||
191 | } | ||||
192 | } | ||||
193 | } | ||||
194 | // For output_gradient_execs, we can be opportunistic and use the wrt symbols (if exists) to find relevant bits. | ||||
195 | // For input_gradient_execs, there is no other way but to loop over all outgoings, find the ones are direct link as backward execs. | ||||
196 | for (j = 0; j
| ||||
197 | { | ||||
198 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_execs, j)((void*)(((char*)((output_execs)->data)) + (size_t)(output_execs )->rsize * (size_t)(j))))->d; | ||||
199 | if (exec_info[d].outgoings && exec_info[d].outgoings->rnum > 0) | ||||
200 | for (k = 0; k
| ||||
201 | { | ||||
202 | const int to_d = *(int*)ccv_array_get(exec_info[d].outgoings, k)((void*)(((char*)((exec_info[d].outgoings)->data)) + (size_t )(exec_info[d].outgoings)->rsize * (size_t)(k))); | ||||
203 | if (!ccv_nnc_cmd_is_backward(exec_info[to_d].cmd)) | ||||
204 | continue; | ||||
205 | int flag = 0; | ||||
206 | for (l = 0; !flag
| ||||
207 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, l)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(l))))->d == to_d) | ||||
208 | flag = 1; | ||||
209 | if (!flag
| ||||
210 | { | ||||
211 | const ccv_nnc_graph_exec_symbol_t backward = { | ||||
212 | .graph = graph, | ||||
213 | .d = to_d | ||||
214 | }; | ||||
215 | ccv_array_push(input_gradient_execs, &backward); | ||||
216 | } | ||||
217 | } | ||||
218 | } | ||||
219 | // Note that we have to use up-to-date ones because the exec_info might have outgoings that is up-to-date. | ||||
220 | ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, exec_info, graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, 0), input_gradient_execs->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, 0), output_gradient_execs->rnum, 1)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc (sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) * ((graph->exec_symbol_info->rnum) - 1)); _visit_->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (graph->exec_symbol_info-> rnum); _i_++) _incoming_edges_ += ((exec_info)[_i_].outgoings ) ? (exec_info)[_i_].outgoings->rnum : 0; const int _heap_mem_ = ((graph->exec_symbol_info->rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * ( graph->exec_symbol_info->rnum) + sizeof(int32_t) * ((graph ->exec_symbol_info->rnum) * 2 + _incoming_edges_)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (sizeof( ccv_nnc_incoming_t) * (graph->exec_symbol_info->rnum) + sizeof(int32_t) * ((graph->exec_symbol_info->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof(ccv_nnc_incoming_t ) * (graph->exec_symbol_info->rnum)); int32_t* _exists_ [2] = { (int32_t*)(_incomings_ + (graph->exec_symbol_info-> rnum)), (int32_t*)(_incomings_ + (graph->exec_symbol_info-> rnum)) + (graph->exec_symbol_info->rnum), }; int32_t* const _edges_ = _exists_[1] + (graph->exec_symbol_info->rnum ); for (_i_ = 0; _i_ < (input_gradient_execs->rnum); _i_ ++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs )->data)) + (size_t)(input_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = { (input_gradient_execs->rnum), 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue ; _incomings_[_idx_].r = 2; if ((exec_info)[_idx_].outgoings) for (_j_ = 0; _j_ < (exec_info)[_idx_].outgoings->rnum ; _j_++) { const int d = *(int*)((void*)(((char*)(((exec_info )[_idx_].outgoings)->data)) + (size_t)((exec_info)[_idx_]. outgoings)->rsize * (size_t)(_j_))); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r = 1; ((void ) sizeof ((_exist_size_[_q_] < (graph->exec_symbol_info ->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph->exec_symbol_info->rnum)) ; else __assert_fail ( "_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (input_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0 ; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 3) continue ; _incomings_[_idx_].r = 4; if ((exec_info)[_idx_].outgoings) for (_j_ = 0; _j_ < (exec_info)[_idx_].outgoings->rnum ; _j_++) { const int d = *(int*)((void*)(((char*)(((exec_info )[_idx_].outgoings)->data)) + (size_t)((exec_info)[_idx_]. outgoings)->rsize * (size_t)(_j_))); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_ [d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_ [d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof (( _exist_size_[_q_] < (graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (output_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_ [_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_ ][_i_]; if (_incomings_[_idx_].r != 5) continue; _incomings_[ _idx_].r = 6; if (_incomings_[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_].c; _j_++) { const int d = _edges_ [_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d].r = 5; ((void) sizeof ((_exist_size_ [_q_] < (graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph->exec_symbol_info-> rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d = 1; } for (_i_ = 0; _i_ < (input_gradient_execs->rnum ); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)( (void*)(((char*)((input_gradient_execs)->data)) + (size_t) (input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (input_gradient_execs->rnum ); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; _visit_-> node[_visit_->size].index = ((_idx_)); _visit_->node[_visit_ ->size].term = ((_incomings_[_idx_].d)); ++_visit_->size ;; if (_incomings_[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7 ; } if ((exec_info)[_idx_].outgoings) { if ((exec_info)[_idx_ ].outgoings->rnum == 1) { const int d = *(int*)((void*)((( char*)(((exec_info)[_idx_].outgoings)->data)) + (size_t)(( exec_info)[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_ [d].c; if (_incomings_[d].c == 0 && _incomings_[d].r == 6 && _d_ < (output_gradient_execs->rnum)) { _exists_ [_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < (exec_info )[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)(( void*)(((char*)(((exec_info)[_idx_].outgoings)->data)) + ( size_t)((exec_info)[_idx_].outgoings)->rsize * (size_t)(_j_ ))); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r == 6 && _d_ < (output_gradient_execs ->rnum)) { ((void) sizeof ((_exist_size_[_q_] < (graph-> exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r == 7) continue; if (!(1)) { ((void) sizeof ((_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c > 0) continue; _visit_->node[_visit_->size].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].d)); _visit_->node[_visit_->size].term = ( (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*) ((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d].d)); ++_visit_->size; ; } if (_heap_mem_) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->size <= (graph->exec_symbol_info->rnum )) ? 1 : 0), __extension__ ({ if (_visit_->size <= (graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 220, __extension__ __PRETTY_FUNCTION__); })); _visit_; }); | ||||
221 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_info)) const node __attribute__ ((unused)) = (exec_info) + idx; { | ||||
222 | if (idx < exec_rnum && !CCV_NNC_GRAPH_EXEC_IS_DEAD(node->flags)((node->flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
223 | maskbit[idx >> 5] |= (1u << (idx & 0x1f)); | ||||
224 | } ccv_nnc_graph_visit_endfor} } | ||||
225 | ccv_array_clear(visited_backward_execs); | ||||
226 | // Add more backward pass to the list. Note that we don't add everything, particularly there are new nodes created through gradient checkpointing are ignored. | ||||
227 | #define visitor(node, idx, _) \ | ||||
228 | if (idx < exec_rnum && !CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD) && maskbit[idx >> 5] & (1u << (idx & 0x1f))) \ | ||||
229 | ccv_array_add_unique_int(visited_backward_execs, idx); | ||||
230 | CCV_NNC_GRAPH_VISIT(graph, reversed_nodes, exec_rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, 0), output_gradient_execs->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, 0), input_gradient_execs->rnum, 0, visitor)do { typedef struct { int8_t d; int8_t r; uint16_t c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (exec_rnum); _i_++) _incoming_edges_ += ((reversed_nodes)[_i_].outgoings) ? (reversed_nodes)[_i_] .outgoings->rnum : 0; const int _heap_mem_ = ((exec_rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_ ; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof (ccv_nnc_incoming_t) * (exec_rnum) + sizeof(int32_t) * ((exec_rnum ) * 2 + _incoming_edges_)); else _incomings_ = (ccv_nnc_incoming_t *)__builtin_alloca (sizeof(ccv_nnc_incoming_t) * (exec_rnum) + sizeof(int32_t) * ((exec_rnum) * 2 + _incoming_edges_)); memset (_incomings_, 0, sizeof(ccv_nnc_incoming_t) * (exec_rnum)); int32_t * _exists_[2] = { (int32_t*)(_incomings_ + (exec_rnum)), (int32_t *)(_incomings_ + (exec_rnum)) + (exec_rnum), }; int32_t* const _edges_ = _exists_[1] + (exec_rnum); for (_i_ = 0; _i_ < ( output_gradient_execs->rnum); _i_++) { ((void) sizeof (((( ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (( (ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = { (output_gradient_execs->rnum), 0, }; int _p_ = 0, _q_ = 1 ; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue ; _incomings_[_idx_].r = 2; if ((reversed_nodes)[_idx_].outgoings ) for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings-> rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes )[_idx_].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_ ].outgoings)->rsize * (size_t)(_j_))); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r = 1; ( (void) sizeof ((_exist_size_[_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)", "ccv_cnnp_model_gradient_checkpointing.c" , 230, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_] [_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_ ), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (output_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0 ; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 3) continue ; _incomings_[_idx_].r = 4; if ((reversed_nodes)[_idx_].outgoings ) for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings-> rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes )[_idx_].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_ ].outgoings)->rsize * (size_t)(_j_))); if (_incomings_[d]. edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_ [d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_ [d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof (( _exist_size_[_q_] < (exec_rnum)) ? 1 : 0), __extension__ ( { if (_exist_size_[_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)", "ccv_cnnp_model_gradient_checkpointing.c" , 230, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_] [_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_ ), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (input_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_ [_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_ ][_i_]; if (_incomings_[_idx_].r != 5) continue; _incomings_[ _idx_].r = 6; if (_incomings_[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_].c; _j_++) { const int d = _edges_ [_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d].r = 5; ((void) sizeof ((_exist_size_ [_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d = 1; } for (_i_ = 0; _i_ < (output_gradient_execs->rnum) ; _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)(( void*)(((char*)((output_gradient_execs)->data)) + (size_t) (output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (output_gradient_execs-> rnum); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_[ _p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; visitor(( (reversed_nodes) + _idx_), (_idx_), (_incomings_[_idx_].d)); if (_incomings_[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if ((reversed_nodes)[_idx_].outgoings) { if ((reversed_nodes)[_idx_ ].outgoings->rnum == 1) { const int d = *(int*)((void*)((( char*)(((reversed_nodes)[_idx_].outgoings)->data)) + (size_t )((reversed_nodes)[_idx_].outgoings)->rsize * (size_t)(0)) ); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_ [d].r == 6 && _d_ < (input_gradient_execs->rnum )) { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes)[_idx_ ].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_].outgoings )->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_ [d].c == 0 && _incomings_[d].r == 6 && _d_ < (input_gradient_execs->rnum)) { ((void) sizeof ((_exist_size_ [_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0" , "ccv_cnnp_model_gradient_checkpointing.c", 230, __extension__ __PRETTY_FUNCTION__); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c > 0) continue; visitor(((reversed_nodes) + ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d), ( ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs )->data)) + (size_t)(input_gradient_execs)->rsize * (size_t )(0))))[_i_].d), (_incomings_[((ccv_nnc_graph_exec_symbol_t*) ((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d) ); } if (_heap_mem_) free(_incomings_); } while (0);; | ||||
231 | for (j = 0; j
| ||||
232 | ccv_array_add_unique_int(visited_backward_execs, ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, j)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(j))))->d); | ||||
233 | #undef visitor | ||||
234 | ccv_cnnp_gradient_checkpoint_build_t build = { | ||||
235 | .tensor_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0), | ||||
236 | .graph_exec_symbols = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0), | ||||
237 | }; | ||||
238 | build.old_tensor_symbol_new_hook_context = ccv_nnc_tensor_symbol_new_hook(graph, _ccv_cnnp_gradient_checkpoint_tensor_symbol_new_hook, &build, &build.old_tensor_symbol_new_hook); | ||||
239 | build.old_tensor_symbol_alias_new_hook_context = ccv_nnc_tensor_symbol_alias_new_hook(graph, _ccv_cnnp_gradient_checkpoint_tensor_symbol_alias_new_hook, &build, &build.old_tensor_symbol_alias_new_hook); | ||||
240 | build.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(graph, _ccv_cnnp_model_gradient_checkpoint_graph_exec_symbol_new_hook, &build, &build.old_graph_exec_symbol_new_hook); | ||||
241 | ccv_array_clear(parameters); | ||||
242 | ccv_array_clear(parameter_ids); | ||||
243 | ccv_array_clear(parameter_trainables); | ||||
244 | ccv_array_clear(internals); | ||||
245 | ccv_array_clear(internal_ids); | ||||
246 | ccv_cnnp_model_sequence_t model_sequence = { | ||||
247 | .bank = kh_init(ccv_cnnp_model_name_bank)kh_init_ccv_cnnp_model_name_bank() | ||||
248 | }; | ||||
249 | ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = { | ||||
250 | .sequence = &model_sequence, | ||||
251 | .prefix = 't', | ||||
252 | .symbols = parameters, | ||||
253 | .ids = parameter_ids, | ||||
254 | .trainables = parameter_trainables, | ||||
255 | }; | ||||
256 | ccv_cnnp_model_add_to_array_context_t add_to_output_context = { | ||||
257 | .sequence = &model_sequence, | ||||
258 | .prefix = 'r', | ||||
259 | .symbols = internals, | ||||
260 | .ids = internal_ids, | ||||
261 | .trainables = 0, | ||||
262 | }; | ||||
263 | ccv_cnnp_model_build_data_t build_data = { | ||||
264 | .is_trainable = checkpoint->is_trainable, | ||||
265 | .model_sequence = &model_sequence, | ||||
266 | .add_to_array = ccv_cnnp_model_add_to_array, | ||||
267 | .parameters = parameters, | ||||
268 | .context = { | ||||
269 | .add_to_parameter = &add_to_parameter_context, | ||||
270 | .add_to_output = &add_to_output_context, | ||||
271 | }, | ||||
272 | .is_gradient_checkpointing = 1, // Mark this as true so we don't allocate gradient_checkpoints array or override the hooks. | ||||
273 | .gradient_checkpoints = 0, | ||||
274 | }; | ||||
275 | checkpoint->model->data = &build_data; | ||||
276 | checkpoint->build(checkpoint->model, graph, checkpoint->inputs, checkpoint->input_size, max_outputs, checkpoint->output_size); | ||||
277 | checkpoint->model->data = 0; | ||||
278 | kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank)kh_destroy_ccv_cnnp_model_name_bank(model_sequence.bank); | ||||
279 | if (model_sequence.sequences) | ||||
280 | ccv_array_free(model_sequence.sequences); | ||||
281 | ccv_nnc_tensor_symbol_new_hook(graph, build.old_tensor_symbol_new_hook, build.old_tensor_symbol_new_hook_context, 0); | ||||
282 | ccv_nnc_tensor_symbol_alias_new_hook(graph, build.old_tensor_symbol_alias_new_hook, build.old_tensor_symbol_alias_new_hook_context, 0); | ||||
283 | ccv_nnc_graph_exec_symbol_autogen(graph, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, 0)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(0))), build.graph_exec_symbols->rnum, 0); | ||||
284 | for (j = 0; j < parameter_ids->rnum; j++) | ||||
285 | ccfreefree(*(char**)ccv_array_get(parameter_ids, j)((void*)(((char*)((parameter_ids)->data)) + (size_t)(parameter_ids )->rsize * (size_t)(j)))); | ||||
286 | for (j = 0; j < internal_ids->rnum; j++) | ||||
287 | ccfreefree(*(char**)ccv_array_get(internal_ids, j)((void*)(((char*)((internal_ids)->data)) + (size_t)(internal_ids )->rsize * (size_t)(j)))); | ||||
288 | // Note that there is no graph optimization applied here. | ||||
289 | exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | ||||
290 | // Reuse existing one. | ||||
291 | ccv_array_t* const newly_input_execs = input_execs; | ||||
292 | ccv_array_t* const newly_output_execs = output_execs; | ||||
293 | ccv_array_clear(newly_input_execs); | ||||
294 | ccv_array_clear(newly_output_execs); | ||||
295 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
296 | { | ||||
297 | const int idx = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))))->d; | ||||
298 | if (idx < 0) | ||||
299 | continue; | ||||
300 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
301 | continue; | ||||
302 | const ccv_nnc_graph_exec_symbol_t symbol = { | ||||
303 | .graph = graph, | ||||
304 | .d = idx | ||||
305 | }; | ||||
306 | const int* inputs = exec_info[idx].inputs; | ||||
307 | int input_size = exec_info[idx].input_size; | ||||
308 | // Only go through forward pass. | ||||
309 | assert(!ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((!ccv_nnc_cmd_is_backward(exec_info[idx].cmd) ) ? 1 : 0), __extension__ ({ if (!ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 309, __extension__ __PRETTY_FUNCTION__); })); | ||||
310 | int flag = 0; | ||||
311 | for (k = 0; inputs && k < input_size && !flag; k++) | ||||
312 | if (inputs[k] >= 0) | ||||
313 | for (l = 0; l < checkpoint->input_size && !flag; l++) | ||||
314 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | ||||
315 | flag = 1; | ||||
316 | if (flag) | ||||
317 | ccv_array_push(newly_input_execs, &symbol); | ||||
318 | flag = 0; | ||||
319 | const int* outputs = exec_info[idx].outputs; | ||||
320 | int output_size = exec_info[idx].output_size; | ||||
321 | for (k = 0; inputs && k < output_size && !flag; k++) | ||||
322 | if (outputs[k] >= 0) | ||||
323 | for (l = 0; l < checkpoint->output_size && !flag; l++) | ||||
324 | if (max_outputs[l].d >= 0 && outputs[k] == max_outputs[l].d) | ||||
325 | flag = 1; | ||||
326 | if (flag) | ||||
327 | ccv_array_push(newly_output_execs, &symbol); | ||||
328 | } | ||||
329 | for (j = 0; j < checkpoint->input_size; j++) | ||||
330 | if (checkpoint->inputs[j].d >= 0) | ||||
331 | ccv_array_push(parameters, checkpoint->inputs + j); | ||||
332 | ccv_nnc_symbolic_graph_simplify(graph, | ||||
333 | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1) | ||||
334 | CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1) | ||||
335 | CCV_NNC_SIMPLIFY_OPS_FUSION)(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1), | ||||
336 | ccv_array_get(parameters, 0)((void*)(((char*)((parameters)->data)) + (size_t)(parameters )->rsize * (size_t)(0))), parameters->rnum, | ||||
337 | max_outputs, checkpoint->output_size, | ||||
338 | ccv_array_get(newly_input_execs, 0)((void*)(((char*)((newly_input_execs)->data)) + (size_t)(newly_input_execs )->rsize * (size_t)(0))), newly_input_execs->rnum, ccv_array_get(newly_output_execs, 0)((void*)(((char*)((newly_output_execs)->data)) + (size_t)( newly_output_execs)->rsize * (size_t)(0))), newly_output_execs->rnum); | ||||
339 | ccv_nnc_graph_exec_symbol_new_hook(graph, build.old_graph_exec_symbol_new_hook, build.old_graph_exec_symbol_new_hook_context, 0); | ||||
340 | // Need to autogen and redo source / destination. | ||||
341 | ccv_nnc_graph_exec_symbol_autogen(graph, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, 0)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(0))), build.graph_exec_symbols->rnum, 0); | ||||
342 | exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | ||||
343 | ccv_array_clear(newly_input_execs); | ||||
344 | for (j = 0; j
| ||||
345 | { | ||||
346 | const int idx = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))))->d; | ||||
347 | if (idx < 0) | ||||
348 | continue; | ||||
349 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
350 | continue; | ||||
351 | const ccv_nnc_graph_exec_symbol_t symbol = { | ||||
352 | .graph = graph, | ||||
353 | .d = idx | ||||
354 | }; | ||||
355 | const int* inputs = exec_info[idx].inputs; | ||||
356 | int input_size = exec_info[idx].input_size; | ||||
357 | // Only go through forward pass. | ||||
358 | assert(!ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((!ccv_nnc_cmd_is_backward(exec_info[idx].cmd) ) ? 1 : 0), __extension__ ({ if (!ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 358, __extension__ __PRETTY_FUNCTION__); })); | ||||
359 | int flag = 0; | ||||
360 | for (k = 0; inputs && k < input_size && !flag; k++) | ||||
361 | if (inputs[k] >= 0) | ||||
362 | for (l = 0; l < checkpoint->input_size && !flag; l++) | ||||
363 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | ||||
364 | flag = 1; | ||||
365 | if (flag) | ||||
366 | ccv_array_push(newly_input_execs, &symbol); | ||||
367 | } | ||||
368 | // Build a map between old tensor symbols and new tensor symbols. | ||||
369 | khash_t(ccv_cnnp_tensor_symbol_map)kh_ccv_cnnp_tensor_symbol_map_t* symbol_map = kh_init(ccv_cnnp_tensor_symbol_map)kh_init_ccv_cnnp_tensor_symbol_map(); | ||||
370 | assert(build.tensor_symbols->rnum <= checkpoint->tensor_symbols->rnum)((void) sizeof ((build.tensor_symbols->rnum <= checkpoint ->tensor_symbols->rnum) ? 1 : 0), __extension__ ({ if ( build.tensor_symbols->rnum <= checkpoint->tensor_symbols ->rnum) ; else __assert_fail ("build.tensor_symbols->rnum <= checkpoint->tensor_symbols->rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 370, __extension__ __PRETTY_FUNCTION__); })); | ||||
371 | // Build a map to potentially map from old input to new input. | ||||
372 | for (j = 0, k = 0; j < build.tensor_symbols->rnum && k < checkpoint->tensor_symbols->rnum;) | ||||
373 | { | ||||
374 | const int from_d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(checkpoint->tensor_symbols, k)((void*)(((char*)((checkpoint->tensor_symbols)->data)) + (size_t)(checkpoint->tensor_symbols)->rsize * (size_t) (k))))->d; | ||||
375 | assert(from_d >= 0)((void) sizeof ((from_d >= 0) ? 1 : 0), __extension__ ({ if (from_d >= 0) ; else __assert_fail ("from_d >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 375, __extension__ __PRETTY_FUNCTION__); })); | ||||
376 | const int to_d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(build.tensor_symbols, j)((void*)(((char*)((build.tensor_symbols)->data)) + (size_t )(build.tensor_symbols)->rsize * (size_t)(j))))->d; | ||||
377 | assert(to_d >= 0)((void) sizeof ((to_d >= 0) ? 1 : 0), __extension__ ({ if ( to_d >= 0) ; else __assert_fail ("to_d >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 377, __extension__ __PRETTY_FUNCTION__); })); | ||||
378 | int from_flag = 0; | ||||
379 | int to_flag = 0; | ||||
380 | for (l = 0; (!from_flag
| ||||
381 | { | ||||
382 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, l)((void*)(((char*)((parameters)->data)) + (size_t)(parameters )->rsize * (size_t)(l))))->d; | ||||
383 | if (d == from_d) | ||||
384 | from_flag = 1; | ||||
385 | if (d == to_d) | ||||
386 | to_flag = 1; | ||||
387 | } | ||||
388 | if (!from_flag
| ||||
389 | for (l = 0; (!from_flag
| ||||
390 | { | ||||
391 | const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, l)((void*)(((char*)((internals)->data)) + (size_t)(internals )->rsize * (size_t)(l))))->d; | ||||
392 | if (d == from_d) | ||||
393 | from_flag = 1; | ||||
394 | if (d == to_d) | ||||
395 | to_flag = 1; | ||||
396 | } | ||||
397 | if (from_flag
| ||||
398 | ++k; | ||||
399 | if (to_flag
| ||||
400 | ++j; | ||||
401 | if (from_flag
| ||||
402 | continue; | ||||
403 | ++k; | ||||
404 | ++j; | ||||
405 | // Skip if from_d is outputs. | ||||
406 | for (l = 0; l < !from_flag && checkpoint->output_size; l++) | ||||
407 | if (checkpoint->outputs[l].d == from_d) | ||||
408 | from_flag = 1; | ||||
409 | if (from_flag
| ||||
410 | continue; | ||||
411 | int ret = 0; | ||||
412 | khiter_t h = kh_put(ccv_cnnp_tensor_symbol_map, symbol_map, from_d, &ret)kh_put_ccv_cnnp_tensor_symbol_map(symbol_map, from_d, &ret ); | ||||
413 | kh_val(symbol_map, h)((symbol_map)->vals[h]) = to_d; | ||||
| |||||
414 | } | ||||
415 | // Now go over all backward passes to replace inputs with the ones from symbol map. Record these that are used. | ||||
416 | ccv_array_clear(newly_used_outputs); | ||||
417 | ccv_array_clear(replaced_backward_execs); | ||||
418 | for (j = 0; j < visited_backward_execs->rnum; j++) | ||||
419 | { | ||||
420 | const int idx = *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))); | ||||
421 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | ||||
422 | continue; | ||||
423 | assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if ( idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 423, __extension__ __PRETTY_FUNCTION__); })); | ||||
424 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 424, __extension__ __PRETTY_FUNCTION__); })); | ||||
425 | if (!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)) | ||||
426 | continue; | ||||
427 | for (k = 0; k < exec_info[idx].input_size; k++) | ||||
428 | if (exec_info[idx].inputs[k] >= 0) | ||||
429 | { | ||||
430 | const khiter_t h = kh_get(ccv_cnnp_tensor_symbol_map, symbol_map, exec_info[idx].inputs[k])kh_get_ccv_cnnp_tensor_symbol_map(symbol_map, exec_info[idx]. inputs[k]); | ||||
431 | if (h != kh_end(symbol_map)((symbol_map)->n_buckets)) // Replacing it. | ||||
432 | { | ||||
433 | const int newly_created_output = kh_val(symbol_map, h)((symbol_map)->vals[h]); | ||||
434 | exec_info[idx].inputs[k] = newly_created_output; | ||||
435 | ccv_array_add_unique_int(newly_used_outputs, newly_created_output); | ||||
436 | ccv_array_add_unique_int(replaced_backward_execs, idx); | ||||
437 | } | ||||
438 | } | ||||
439 | } | ||||
440 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
441 | { | ||||
442 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | ||||
443 | if (symbol->d < 0) | ||||
444 | continue; | ||||
445 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | ||||
446 | continue; | ||||
447 | int x, y; | ||||
448 | for (k = 0; k < replaced_backward_execs->rnum; k++) | ||||
449 | { | ||||
450 | const int idx = *(int*)ccv_array_get(replaced_backward_execs, k)((void*)(((char*)((replaced_backward_execs)->data)) + (size_t )(replaced_backward_execs)->rsize * (size_t)(k))); | ||||
451 | assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if ( idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 451, __extension__ __PRETTY_FUNCTION__); })); | ||||
452 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 452, __extension__ __PRETTY_FUNCTION__); })); | ||||
453 | assert(ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((ccv_nnc_cmd_is_backward(exec_info[idx].cmd)) ? 1 : 0), __extension__ ({ if (ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 453, __extension__ __PRETTY_FUNCTION__); })); | ||||
454 | int flag = 0; | ||||
455 | for (x = 0; !flag && x < exec_info[idx].input_size; x++) | ||||
456 | for (y = 0; !flag && y < exec_info[symbol->d].output_size; y++) | ||||
457 | if (exec_info[idx].inputs[x] == exec_info[symbol->d].outputs[y]) | ||||
458 | flag = 1; | ||||
459 | if (flag) | ||||
460 | ccv_nnc_graph_exec_symbol_concat(graph, *symbol, (ccv_nnc_graph_exec_symbol_t){ | ||||
461 | .graph = graph, | ||||
462 | .d = idx | ||||
463 | }); | ||||
464 | } | ||||
465 | } | ||||
466 | // Find parents to visited_backward_execs, and use that as the starting point of all newly added graph_exec_symbols. Use the visited backward execs as the source, use all its parents as destination, go through with graph visit. | ||||
467 | ccv_sparse_matrix_t* const exec_dep = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, graph->exec_symbol_info->rnum, CCV_8U | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0); | ||||
468 | #define for_block(x, val) \ | ||||
469 | do { \ | ||||
470 | if (((uint8_t*)val)[0] != 0) \ | ||||
471 | ccv_array_push(buf, &x); \ | ||||
472 | } while (0) | ||||
473 | const uint8_t one = 1; | ||||
474 | // Now go from outputs to inputs, unmark visited ones. | ||||
475 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_info)) const node __attribute__ ((unused)) = (exec_info) + idx; { | ||||
476 | if (idx < exec_rnum && !CCV_NNC_GRAPH_EXEC_IS_DEAD(node->flags)((node->flags) & CCV_NNC_GRAPH_EXEC_DEAD) && maskbit[idx >> 5] & (1u << (idx & 0x1f))) | ||||
477 | { | ||||
478 | ccv_array_clear(buf); | ||||
479 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx); | ||||
480 | if (vector) | ||||
481 | CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S : { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f64 + (0))); } } } while (0); break; } default: { do { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep )->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block ((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep )->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector )->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | ||||
482 | if (node->outgoings && node->outgoings->rnum > 0) | ||||
483 | { | ||||
484 | ccv_array_t* const outgoings = node->outgoings; | ||||
485 | for (k = 0; k < outgoings->rnum; k++) | ||||
486 | { | ||||
487 | const int outgoing_d = *(int*)ccv_array_get(outgoings, k)((void*)(((char*)((outgoings)->data)) + (size_t)(outgoings )->rsize * (size_t)(k))); | ||||
488 | if (outgoing_d >= exec_rnum) | ||||
489 | continue; | ||||
490 | int l; | ||||
491 | // We cannot avoid the ones that visited, because these may not contain all the deps. | ||||
492 | ccv_set_sparse_matrix_cell(exec_dep, outgoing_d, idx, &one); | ||||
493 | for (l = 0; l < buf->rnum; l++) | ||||
494 | ccv_set_sparse_matrix_cell(exec_dep, outgoing_d, *(int*)ccv_array_get(buf, l)((void*)(((char*)((buf)->data)) + (size_t)(buf)->rsize * (size_t)(l))), &one); | ||||
495 | } | ||||
496 | } | ||||
497 | } | ||||
498 | } ccv_nnc_graph_visit_endfor} } | ||||
499 | // Now go from outputs to inputs, unmark visited ones. | ||||
500 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_info)) const node __attribute__ ((unused)) = (exec_info) + idx; { | ||||
501 | if (idx < exec_rnum) | ||||
502 | maskbit[idx >> 5] &= ~(1u << (idx & 0x1f)); | ||||
503 | } ccv_nnc_graph_visit_endfor} } | ||||
504 | ccv_nnc_graph_visit_free(visit); | ||||
505 | #undef for_block | ||||
506 | // Go through visited backward execs, remove the ones that has no dependency on any replaced backward execs. | ||||
507 | for (j = 0; j < visited_backward_execs->rnum;) | ||||
508 | { | ||||
509 | const int idx = *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))); | ||||
510 | if (ccv_array_contain_int(replaced_backward_execs, idx)) | ||||
511 | { | ||||
512 | ++j; | ||||
513 | continue; | ||||
514 | } | ||||
515 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx); | ||||
516 | int flag = 0; | ||||
517 | #define for_block(x, val) \ | ||||
518 | do { \ | ||||
519 | if (((uint8_t*)val)[0] != 0) \ | ||||
520 | if (ccv_array_contain_int(replaced_backward_execs, x)) \ | ||||
521 | flag = 1; \ | ||||
522 | } while (0) | ||||
523 | if (vector) | ||||
524 | CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S : { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f64 + (0))); } } } while (0); break; } default: { do { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep )->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block ((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep )->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector )->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | ||||
525 | #undef for_block | ||||
526 | if (!flag) | ||||
527 | { | ||||
528 | if (j < visited_backward_execs->rnum - 1) | ||||
529 | *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))) = *(int*)ccv_array_get(visited_backward_execs, visited_backward_execs->rnum - 1)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(visited_backward_execs ->rnum - 1))); | ||||
530 | --visited_backward_execs->rnum; | ||||
531 | continue; | ||||
532 | } | ||||
533 | ++j; | ||||
534 | } | ||||
535 | // Now go through all replaced_backward_execs to find the ones has no dependencies in visited_backward_execs. | ||||
536 | for (j = 0; j < replaced_backward_execs->rnum; j++) | ||||
537 | { | ||||
538 | const int idx = *(int*)ccv_array_get(replaced_backward_execs, j)((void*)(((char*)((replaced_backward_execs)->data)) + (size_t )(replaced_backward_execs)->rsize * (size_t)(j))); | ||||
539 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx); | ||||
540 | int flag = 0; | ||||
541 | #define for_block(x, val) \ | ||||
542 | do { \ | ||||
543 | if (((uint8_t*)val)[0] != 0) \ | ||||
544 | if (ccv_array_contain_int(visited_backward_execs, x)) \ | ||||
545 | flag = 1; \ | ||||
546 | } while (0) | ||||
547 | if (vector) | ||||
548 | CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block)do { switch ((((exec_dep)->type) & 0xFF000)) { case CCV_32S : { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f32 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.i64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F: { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((exec_dep)->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < (vector)->size ; _i_++) { for_block((_i_), ((vector)->data.f64 + (_i_ * _c_ ))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t ) + ((_ccv_get_data_type_size[(((exec_dep)->type) & 0xFF000 ) >> 12] * (((exec_dep)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t * const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.f64 + (0))); } } } while (0); break; } default: { do { int _i_; __attribute__((unused)) const size_t _c_ = (((exec_dep )->type) & 0xFFF); if ((exec_dep)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block ((_i_), ((vector)->data.u8 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((exec_dep)->type) & 0xFF000) >> 12] * (((exec_dep )->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector)->index; for (_i_ = 0; _i_ < (vector )->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t*)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_-> i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | ||||
549 | #undef for_block | ||||
550 | // If this one has no parents that is within the visited_backward_execs, it is a good place for us to add all its parents as dependency for input_execs. | ||||
551 | if (!flag) | ||||
552 | { | ||||
553 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 553, __extension__ __PRETTY_FUNCTION__); })); | ||||
554 | ccv_array_t* const outgoings = reversed_nodes[idx].outgoings; | ||||
555 | assert(outgoings)((void) sizeof ((outgoings) ? 1 : 0), __extension__ ({ if (outgoings ) ; else __assert_fail ("outgoings", "ccv_cnnp_model_gradient_checkpointing.c" , 555, __extension__ __PRETTY_FUNCTION__); })); | ||||
556 | for (k = 0; k < outgoings->rnum; k++) | ||||
557 | { | ||||
558 | const int d = *(int*)ccv_array_get(outgoings, k)((void*)(((char*)((outgoings)->data)) + (size_t)(outgoings )->rsize * (size_t)(k))); | ||||
559 | for (l = 0; l < newly_input_execs->rnum; l++) | ||||
560 | { | ||||
561 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ | ||||
562 | .graph = graph, | ||||
563 | .d = d | ||||
564 | }, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(newly_input_execs, l)((void*)(((char*)((newly_input_execs)->data)) + (size_t)(newly_input_execs )->rsize * (size_t)(l)))); | ||||
565 | } | ||||
566 | } | ||||
567 | } | ||||
568 | } | ||||
569 | ccv_matrix_free(exec_dep); | ||||
570 | // Go through all exec, free ones that doesn't have output used. | ||||
571 | // Reuse this array because it is not useful any more. | ||||
572 | ccv_array_t* forward_pass_inputs = visited_backward_execs; | ||||
573 | int any_deleted; | ||||
574 | do { | ||||
575 | // Build a map of still active inputs. | ||||
576 | ccv_array_clear(forward_pass_inputs); | ||||
577 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
578 | { | ||||
579 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | ||||
580 | if (symbol->d < 0) | ||||
581 | continue; | ||||
582 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | ||||
583 | continue; | ||||
584 | int* const inputs = exec_info[symbol->d].inputs; | ||||
585 | const int input_size = exec_info[symbol->d].input_size; | ||||
586 | for (k = 0; k < input_size; k++) | ||||
587 | ccv_array_add_unique_int(forward_pass_inputs, inputs[k]); | ||||
588 | } | ||||
589 | any_deleted = 0; | ||||
590 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
591 | { | ||||
592 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | ||||
593 | if (symbol->d < 0) | ||||
594 | continue; | ||||
595 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | ||||
596 | continue; | ||||
597 | int* const outputs = exec_info[symbol->d].outputs; | ||||
598 | const int output_size = exec_info[symbol->d].output_size; | ||||
599 | int flag = 0; | ||||
600 | for (k = 0; !flag && k < output_size; k++) | ||||
601 | flag = ccv_array_contain_int(newly_used_outputs, outputs[k]) || ccv_array_contain_int(forward_pass_inputs, outputs[k]); | ||||
602 | if (flag) | ||||
603 | continue; | ||||
604 | ccv_nnc_graph_exec_symbol_free(graph, *symbol); | ||||
605 | symbol->d = -1; | ||||
606 | symbol->graph = 0; | ||||
607 | any_deleted = 1; | ||||
608 | } | ||||
609 | } while (any_deleted); | ||||
610 | ccv_array_clear(forward_pass_inputs); | ||||
611 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
612 | { | ||||
613 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | ||||
614 | if (symbol->d < 0) | ||||
615 | continue; | ||||
616 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | ||||
617 | continue; | ||||
618 | int* const inputs = exec_info[symbol->d].inputs; | ||||
619 | const int input_size = exec_info[symbol->d].input_size; | ||||
620 | for (k = 0; k < input_size; k++) | ||||
621 | ccv_array_add_unique_int(forward_pass_inputs, inputs[k]); | ||||
622 | int* const outputs = exec_info[symbol->d].outputs; | ||||
623 | const int output_size = exec_info[symbol->d].output_size; | ||||
624 | for (k = 0; k < output_size; k++) | ||||
625 | ccv_array_add_unique_int(forward_pass_inputs, outputs[k]); | ||||
626 | } | ||||
627 | // Free unused tensor symbols. | ||||
628 | for (j = 0; j < build.tensor_symbols->rnum; j++) | ||||
629 | { | ||||
630 | const ccv_nnc_tensor_symbol_t* symbol = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(build.tensor_symbols, j)((void*)(((char*)((build.tensor_symbols)->data)) + (size_t )(build.tensor_symbols)->rsize * (size_t)(j)))); | ||||
631 | if (ccv_array_contain_int(newly_used_outputs, symbol->d) || ccv_array_contain_int(forward_pass_inputs, symbol->d)) | ||||
632 | continue; | ||||
633 | ccv_nnc_tensor_symbol_free(graph, *symbol); | ||||
634 | } | ||||
635 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | ||||
636 | { | ||||
637 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | ||||
638 | if (symbol->d < 0) | ||||
639 | continue; | ||||
640 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | ||||
641 | continue; | ||||
642 | ccv_nnc_graph_exec_symbol_set_flags(graph, *symbol, CCV_NNC_GRAPH_EXEC_DISABLE_OPT); | ||||
643 | } | ||||
644 | // Free these newly created execs and tensor symbols. | ||||
645 | ccv_array_free(build.tensor_symbols); | ||||
646 | ccv_array_free(build.graph_exec_symbols); | ||||
647 | kh_destroy(ccv_cnnp_tensor_symbol_map, symbol_map)kh_destroy_ccv_cnnp_tensor_symbol_map(symbol_map); | ||||
648 | } | ||||
649 | ccfreefree(max_outputs); | ||||
650 | ccv_array_free(buf); | ||||
651 | ccv_array_free(newly_used_outputs); | ||||
652 | ccv_array_free(parameters); | ||||
653 | ccv_array_free(parameter_ids); | ||||
654 | ccv_array_free(parameter_trainables); | ||||
655 | ccv_array_free(internals); | ||||
656 | ccv_array_free(internal_ids); | ||||
657 | ccfreefree(maskbit); | ||||
658 | ccv_array_free(input_gradient_execs); | ||||
659 | ccv_array_free(output_gradient_execs); | ||||
660 | ccv_array_free(input_execs); | ||||
661 | ccv_array_free(output_execs); | ||||
662 | ccv_array_free(replaced_backward_execs); | ||||
663 | ccv_array_free(visited_backward_execs); | ||||
664 | for (i = 0; i < exec_rnum; i++) | ||||
665 | if (reversed_nodes[i].outgoings) | ||||
666 | ccv_array_free(reversed_nodes[i].outgoings); | ||||
667 | ccfreefree(reversed_nodes); | ||||
668 | } |