File: | nnc/ccv_cnnp_model_gradient_checkpointing.c |
Warning: | line 158, column 4 Use of memory allocated with size zero |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | #include "ccv_nnc.h" | |||
2 | #include "ccv_nnc_easy.h" | |||
3 | #include "ccv_nnc_internal.h" | |||
4 | #include "ccv_internal.h" | |||
5 | #include "_ccv_cnnp_model.h" | |||
6 | // This can be removed once we organized ccv_cnnp_apply_gradient_checkpoints better. | |||
7 | #include "_ccv_nnc_symbolic_graph.h" | |||
8 | ||||
9 | void ccv_cnnp_model_gradient_checkpoints_cleanup_after_build(ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_symbolic_graph_t* const graph) | |||
10 | { | |||
11 | ccv_array_t* const gradient_checkpoints = compiled_data->gradient_checkpoints; | |||
12 | if (!gradient_checkpoints || gradient_checkpoints->rnum == 0) // No saved gradient checkpoints, this is an easy way out. | |||
13 | return; | |||
14 | int i, j; | |||
15 | const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (const ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0)((void*)(((char*)((graph->tensor_symbol_info)->data)) + (size_t)(graph->tensor_symbol_info)->rsize * (size_t)( 0))); | |||
16 | // Go through to check if any tensors that supposes in this map is removed. | |||
17 | for (i = 0; i < gradient_checkpoints->rnum; i++) | |||
18 | { | |||
19 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(gradient_checkpoints, i)((void*)(((char*)((gradient_checkpoints)->data)) + (size_t )(gradient_checkpoints)->rsize * (size_t)(i))); | |||
20 | for (j = 0; j < checkpoint->tensor_symbols->rnum; j++) | |||
21 | { | |||
22 | ccv_nnc_tensor_symbol_t* const symbol = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(checkpoint->tensor_symbols, j)((void*)(((char*)((checkpoint->tensor_symbols)->data)) + (size_t)(checkpoint->tensor_symbols)->rsize * (size_t) (j)))); | |||
23 | if (symbol->d >= 0 && symbol->d < graph->tensor_symbol_info->rnum) | |||
24 | // If it is dead, we need to remove this symbol. | |||
25 | if (CCV_NNC_TENSOR_SYMBOL_IS_DEAD(tensor_symbol_info[symbol->d].flags)((tensor_symbol_info[symbol->d].flags) & CCV_NNC_TENSOR_SYMBOL_DEAD )) | |||
26 | { | |||
27 | symbol->d = -1; | |||
28 | symbol->graph = 0; | |||
29 | } | |||
30 | } | |||
31 | } | |||
32 | } | |||
33 | ||||
34 | typedef struct { | |||
35 | ccv_array_t* outgoings; | |||
36 | } ccv_nnc_graph_exec_symbol_reverse_t; | |||
37 | ||||
38 | typedef struct { | |||
39 | ccv_cnnp_model_gradient_checkpoint_build_context_t tensor_context; | |||
40 | ccv_array_t* graph_exec_symbols; | |||
41 | ccv_nnc_graph_exec_symbol_new_hook_f old_graph_exec_symbol_new_hook; | |||
42 | void* old_graph_exec_symbol_new_hook_context; | |||
43 | ccv_array_t* all_tensor_symbols; | |||
44 | } ccv_cnnp_gradient_checkpoint_build_t; | |||
45 | ||||
46 | static void _ccv_cnnp_gradient_checkpoint_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name) | |||
47 | { | |||
48 | ccv_cnnp_gradient_checkpoint_build_t* const build_context = (ccv_cnnp_gradient_checkpoint_build_t*)context; | |||
49 | if (build_context->tensor_context.record) | |||
50 | ccv_array_push(build_context->tensor_context.tensor_symbols, &symbol); | |||
51 | ccv_array_push(build_context->all_tensor_symbols, &symbol); | |||
52 | if (build_context->tensor_context.old_tensor_symbol_new_hook) | |||
53 | build_context->tensor_context.old_tensor_symbol_new_hook(build_context->tensor_context.old_tensor_symbol_new_hook_context, symbol, info, name); | |||
54 | } | |||
55 | ||||
56 | static void _ccv_cnnp_gradient_checkpoint_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC(12)], const int inc[CCV_NNC_MAX_DIM_ALLOC(12)], const ccv_nnc_tensor_param_t info, const char* const name) | |||
57 | { | |||
58 | ccv_cnnp_gradient_checkpoint_build_t* const build_context = (ccv_cnnp_gradient_checkpoint_build_t*)context; | |||
59 | if (build_context->tensor_context.record) | |||
60 | ccv_array_push(build_context->tensor_context.tensor_symbols, &symbol); | |||
61 | ccv_array_push(build_context->all_tensor_symbols, &symbol); | |||
62 | if (build_context->tensor_context.old_tensor_symbol_alias_new_hook) | |||
63 | build_context->tensor_context.old_tensor_symbol_alias_new_hook(build_context->tensor_context.old_tensor_symbol_alias_new_hook_context, symbol, from_symbol, ofs, inc, info, name); | |||
64 | } | |||
65 | ||||
66 | static void _ccv_cnnp_model_gradient_checkpoint_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name) | |||
67 | { | |||
68 | ccv_cnnp_gradient_checkpoint_build_t* const build = (ccv_cnnp_gradient_checkpoint_build_t*)context; | |||
69 | ccv_array_push(build->graph_exec_symbols, &symbol); | |||
70 | if (build->old_graph_exec_symbol_new_hook) | |||
71 | build->old_graph_exec_symbol_new_hook(build->old_graph_exec_symbol_new_hook_context, symbol, cmd, inputs, input_size, outputs, output_size, name); | |||
72 | } | |||
73 | ||||
74 | KHASH_MAP_INIT_INT(ccv_cnnp_tensor_symbol_map, int)typedef struct kh_ccv_cnnp_tensor_symbol_map_s { khint_t n_buckets , size, n_occupied, upper_bound; khint32_t *flags; khint32_t * keys; int *vals; } kh_ccv_cnnp_tensor_symbol_map_t; static inline __attribute__ ((__unused__)) kh_ccv_cnnp_tensor_symbol_map_t *kh_init_ccv_cnnp_tensor_symbol_map(void) { return (kh_ccv_cnnp_tensor_symbol_map_t *)calloc(1,sizeof(kh_ccv_cnnp_tensor_symbol_map_t)); } static inline __attribute__ ((__unused__)) void kh_destroy_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h) { if (h) { free((void *) h->keys); free(h->flags); free((void *)h->vals); free (h); } } static inline __attribute__ ((__unused__)) void kh_clear_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h) { if (h && h-> flags) { memset(h->flags, 0xaa, ((h->n_buckets) < 16 ? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h-> size = h->n_occupied = 0; } } static inline __attribute__ ( (__unused__)) khint_t kh_get_ccv_cnnp_tensor_symbol_map(const kh_ccv_cnnp_tensor_symbol_map_t *h, khint32_t key) { if (h-> n_buckets) { khint_t k, i, last, mask, step = 0; mask = h-> n_buckets - 1; k = (khint32_t)(key); i = k & mask; last = i; while (!((h->flags[i>>4]>>((i&0xfU)<< 1))&2) && (((h->flags[i>>4]>>((i& 0xfU)<<1))&1) || !((h->keys[i]) == (key)))) { i = (i + (++step)) & mask; if (i == last) return h->n_buckets ; } return ((h->flags[i>>4]>>((i&0xfU)<< 1))&3)? h->n_buckets : i; } else return 0; } static inline __attribute__ ((__unused__)) int kh_resize_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h, khint_t new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (--(new_n_buckets ), (new_n_buckets)|=(new_n_buckets)>>1, (new_n_buckets) |=(new_n_buckets)>>2, (new_n_buckets)|=(new_n_buckets)>> 4, (new_n_buckets)|=(new_n_buckets)>>8, (new_n_buckets) |=(new_n_buckets)>>16, ++(new_n_buckets)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; else { new_flags = (khint32_t *)malloc(((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (!new_flags) return -1; memset(new_flags , 0xaa, ((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (h->n_buckets < new_n_buckets ) { khint32_t *new_keys = (khint32_t*)realloc((void *)h->keys ,new_n_buckets * sizeof(khint32_t)); if (!new_keys) { free(new_flags ); return -1; } h->keys = new_keys; if (1) { int *new_vals = (int*)realloc((void *)h->vals,new_n_buckets * sizeof(int )); if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals; } } } } if (j) { for (j = 0; j != h->n_buckets ; ++j) { if (((h->flags[j>>4]>>((j&0xfU)<< 1))&3) == 0) { khint32_t key = h->keys[j]; int val; khint_t new_mask; new_mask = new_n_buckets - 1; if (1) val = h->vals [j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<< 1)); while (1) { khint_t k, i, step = 0; k = (khint32_t)(key) ; i = k & new_mask; while (!((new_flags[i>>4]>> ((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask ; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<< 1))); if (i < h->n_buckets && ((h->flags[i>> 4]>>((i&0xfU)<<1))&3) == 0) { { khint32_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (1 ) { int tmp = h->vals[i]; h->vals[i] = val; val = tmp; } (h->flags[i>>4]|=1ul<<((i&0xfU)<<1) ); } else { h->keys[i] = key; if (1) h->vals[i] = val; break ; } } } } if (h->n_buckets > new_n_buckets) { h->keys = (khint32_t*)realloc((void *)h->keys,new_n_buckets * sizeof (khint32_t)); if (1) h->vals = (int*)realloc((void *)h-> vals,new_n_buckets * sizeof(int)); } free(h->flags); h-> flags = new_flags; h->n_buckets = new_n_buckets; h->n_occupied = h->size; h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static inline __attribute__ ((__unused__)) khint_t kh_put_ccv_cnnp_tensor_symbol_map(kh_ccv_cnnp_tensor_symbol_map_t *h, khint32_t key, int *ret) { khint_t x; if (h->n_occupied >= h->upper_bound) { if (h->n_buckets > (h->size <<1)) { if (kh_resize_ccv_cnnp_tensor_symbol_map(h, h-> n_buckets - 1) < 0) { *ret = -1; return h->n_buckets; } } else if (kh_resize_ccv_cnnp_tensor_symbol_map(h, h->n_buckets + 1) < 0) { *ret = -1; return h->n_buckets; } } { khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; x = site = h->n_buckets; k = (khint32_t)(key); i = k & mask; if (((h->flags[i>>4]>>((i&0xfU)<<1))& 2)) x = i; else { last = i; while (!((h->flags[i>>4] >>((i&0xfU)<<1))&2) && (((h->flags [i>>4]>>((i&0xfU)<<1))&1) || !((h-> keys[i]) == (key)))) { if (((h->flags[i>>4]>>( (i&0xfU)<<1))&1)) site = i; i = (i + (++step)) & mask; if (i == last) { x = site; break; } } if (x == h->n_buckets ) { if (((h->flags[i>>4]>>((i&0xfU)<< 1))&2) && site != h->n_buckets) x = site; else x = i; } } } if (((h->flags[x>>4]>>((x&0xfU )<<1))&2)) { h->keys[x] = key; (h->flags[x>> 4]&=~(3ul<<((x&0xfU)<<1))); ++h->size; ++h->n_occupied; *ret = 1; } else if (((h->flags[x>> 4]>>((x&0xfU)<<1))&1)) { h->keys[x] = key ; (h->flags[x>>4]&=~(3ul<<((x&0xfU)<< 1))); ++h->size; *ret = 2; } else *ret = 0; return x; } static inline __attribute__ ((__unused__)) void kh_del_ccv_cnnp_tensor_symbol_map (kh_ccv_cnnp_tensor_symbol_map_t *h, khint_t x) { if (x != h-> n_buckets && !((h->flags[x>>4]>>((x& 0xfU)<<1))&3)) { (h->flags[x>>4]|=1ul<< ((x&0xfU)<<1)); --h->size; } } | |||
75 | KHASH_SET_INIT_INT(ccv_cnnp_tensor_symbol_set)typedef struct kh_ccv_cnnp_tensor_symbol_set_s { khint_t n_buckets , size, n_occupied, upper_bound; khint32_t *flags; khint32_t * keys; char *vals; } kh_ccv_cnnp_tensor_symbol_set_t; static inline __attribute__ ((__unused__)) kh_ccv_cnnp_tensor_symbol_set_t *kh_init_ccv_cnnp_tensor_symbol_set(void) { return (kh_ccv_cnnp_tensor_symbol_set_t *)calloc(1,sizeof(kh_ccv_cnnp_tensor_symbol_set_t)); } static inline __attribute__ ((__unused__)) void kh_destroy_ccv_cnnp_tensor_symbol_set (kh_ccv_cnnp_tensor_symbol_set_t *h) { if (h) { free((void *) h->keys); free(h->flags); free((void *)h->vals); free (h); } } static inline __attribute__ ((__unused__)) void kh_clear_ccv_cnnp_tensor_symbol_set (kh_ccv_cnnp_tensor_symbol_set_t *h) { if (h && h-> flags) { memset(h->flags, 0xaa, ((h->n_buckets) < 16 ? 1 : (h->n_buckets)>>4) * sizeof(khint32_t)); h-> size = h->n_occupied = 0; } } static inline __attribute__ ( (__unused__)) khint_t kh_get_ccv_cnnp_tensor_symbol_set(const kh_ccv_cnnp_tensor_symbol_set_t *h, khint32_t key) { if (h-> n_buckets) { khint_t k, i, last, mask, step = 0; mask = h-> n_buckets - 1; k = (khint32_t)(key); i = k & mask; last = i; while (!((h->flags[i>>4]>>((i&0xfU)<< 1))&2) && (((h->flags[i>>4]>>((i& 0xfU)<<1))&1) || !((h->keys[i]) == (key)))) { i = (i + (++step)) & mask; if (i == last) return h->n_buckets ; } return ((h->flags[i>>4]>>((i&0xfU)<< 1))&3)? h->n_buckets : i; } else return 0; } static inline __attribute__ ((__unused__)) int kh_resize_ccv_cnnp_tensor_symbol_set (kh_ccv_cnnp_tensor_symbol_set_t *h, khint_t new_n_buckets) { khint32_t *new_flags = 0; khint_t j = 1; { (--(new_n_buckets ), (new_n_buckets)|=(new_n_buckets)>>1, (new_n_buckets) |=(new_n_buckets)>>2, (new_n_buckets)|=(new_n_buckets)>> 4, (new_n_buckets)|=(new_n_buckets)>>8, (new_n_buckets) |=(new_n_buckets)>>16, ++(new_n_buckets)); if (new_n_buckets < 4) new_n_buckets = 4; if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; else { new_flags = (khint32_t *)malloc(((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (!new_flags) return -1; memset(new_flags , 0xaa, ((new_n_buckets) < 16? 1 : (new_n_buckets)>> 4) * sizeof(khint32_t)); if (h->n_buckets < new_n_buckets ) { khint32_t *new_keys = (khint32_t*)realloc((void *)h->keys ,new_n_buckets * sizeof(khint32_t)); if (!new_keys) { free(new_flags ); return -1; } h->keys = new_keys; if (0) { char *new_vals = (char*)realloc((void *)h->vals,new_n_buckets * sizeof(char )); if (!new_vals) { free(new_flags); return -1; } h->vals = new_vals; } } } } if (j) { for (j = 0; j != h->n_buckets ; ++j) { if (((h->flags[j>>4]>>((j&0xfU)<< 1))&3) == 0) { khint32_t key = h->keys[j]; char val; khint_t new_mask; new_mask = new_n_buckets - 1; if (0) val = h->vals [j]; (h->flags[j>>4]|=1ul<<((j&0xfU)<< 1)); while (1) { khint_t k, i, step = 0; k = (khint32_t)(key) ; i = k & new_mask; while (!((new_flags[i>>4]>> ((i&0xfU)<<1))&2)) i = (i + (++step)) & new_mask ; (new_flags[i>>4]&=~(2ul<<((i&0xfU)<< 1))); if (i < h->n_buckets && ((h->flags[i>> 4]>>((i&0xfU)<<1))&3) == 0) { { khint32_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } if (0 ) { char tmp = h->vals[i]; h->vals[i] = val; val = tmp; } (h->flags[i>>4]|=1ul<<((i&0xfU)<< 1)); } else { h->keys[i] = key; if (0) h->vals[i] = val ; break; } } } } if (h->n_buckets > new_n_buckets) { h-> keys = (khint32_t*)realloc((void *)h->keys,new_n_buckets * sizeof(khint32_t)); if (0) h->vals = (char*)realloc((void *)h->vals,new_n_buckets * sizeof(char)); } free(h->flags ); h->flags = new_flags; h->n_buckets = new_n_buckets; h ->n_occupied = h->size; h->upper_bound = (khint_t)(h ->n_buckets * __ac_HASH_UPPER + 0.5); } return 0; } static inline __attribute__ ((__unused__)) khint_t kh_put_ccv_cnnp_tensor_symbol_set (kh_ccv_cnnp_tensor_symbol_set_t *h, khint32_t key, int *ret) { khint_t x; if (h->n_occupied >= h->upper_bound) { if (h->n_buckets > (h->size<<1)) { if (kh_resize_ccv_cnnp_tensor_symbol_set (h, h->n_buckets - 1) < 0) { *ret = -1; return h->n_buckets ; } } else if (kh_resize_ccv_cnnp_tensor_symbol_set(h, h-> n_buckets + 1) < 0) { *ret = -1; return h->n_buckets; } } { khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; x = site = h->n_buckets; k = (khint32_t)(key); i = k & mask; if (((h->flags[i>>4]>>((i&0xfU )<<1))&2)) x = i; else { last = i; while (!((h-> flags[i>>4]>>((i&0xfU)<<1))&2) && (((h->flags[i>>4]>>((i&0xfU)<<1))& 1) || !((h->keys[i]) == (key)))) { if (((h->flags[i>> 4]>>((i&0xfU)<<1))&1)) site = i; i = (i + (++step)) & mask; if (i == last) { x = site; break; } } if (x == h->n_buckets) { if (((h->flags[i>>4]>> ((i&0xfU)<<1))&2) && site != h->n_buckets ) x = site; else x = i; } } } if (((h->flags[x>>4]>> ((x&0xfU)<<1))&2)) { h->keys[x] = key; (h-> flags[x>>4]&=~(3ul<<((x&0xfU)<<1))) ; ++h->size; ++h->n_occupied; *ret = 1; } else if (((h-> flags[x>>4]>>((x&0xfU)<<1))&1)) { h ->keys[x] = key; (h->flags[x>>4]&=~(3ul<< ((x&0xfU)<<1))); ++h->size; *ret = 2; } else *ret = 0; return x; } static inline __attribute__ ((__unused__)) void kh_del_ccv_cnnp_tensor_symbol_set(kh_ccv_cnnp_tensor_symbol_set_t *h, khint_t x) { if (x != h->n_buckets && !((h-> flags[x>>4]>>((x&0xfU)<<1))&3)) { ( h->flags[x>>4]|=1ul<<((x&0xfU)<<1)); --h->size; } } | |||
76 | ||||
77 | ccv_nnc_exec_dep_t _ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit, const int exec_rnum, uint32_t* const maskbit) | |||
78 | { | |||
79 | int* chain_ids = ccmallocmalloc(sizeof(int) * exec_rnum * 2); | |||
80 | int* chain_pos = chain_ids + exec_rnum; | |||
81 | int* buf = (int*)ccmallocmalloc(sizeof(int) * exec_rnum); | |||
82 | int* reversed_depth = buf; | |||
83 | const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | |||
84 | int i, j; | |||
85 | // Go reverse order to generate the distance from sink. | |||
86 | ccv_nnc_graph_visit_for_reversed(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = (visit)->size - 1; _i_ >= 0; _i_-- ) { const int idx __attribute__((unused)) = (visit)->node[ _i_].index; const int term __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_symbol_info)) const node __attribute__ ((unused)) = (exec_symbol_info) + idx; { | |||
| ||||
87 | if (idx >= exec_rnum || CCV_NNC_GRAPH_EXEC_IS_DEAD(node->flags)((node->flags) & CCV_NNC_GRAPH_EXEC_DEAD) || !(maskbit[idx >> 5] & (1u << (idx & 0x1f)))) | |||
88 | continue; | |||
89 | chain_ids[idx] = -1; | |||
90 | if (!node->outgoings || node->outgoings->rnum == 0) | |||
91 | { | |||
92 | reversed_depth[idx] = 0; | |||
93 | continue; | |||
94 | } | |||
95 | int depth = -1; | |||
96 | for (i = 0; i < node->outgoings->rnum; i++) | |||
97 | { | |||
98 | const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
99 | if (outgoing >= exec_rnum) | |||
100 | continue; | |||
101 | depth = ccv_max(depth, reversed_depth[outgoing])({ typeof (depth) _a = (depth); typeof (reversed_depth[outgoing ]) _b = (reversed_depth[outgoing]); (_a > _b) ? _a : _b; } ); | |||
102 | } | |||
103 | reversed_depth[idx] = depth + 1; | |||
104 | } ccv_nnc_graph_visit_endfor} } | |||
105 | // Go in order to generate chain ids (if there are multiple exits, we use the reverse depth to break the tie). | |||
106 | // Note that we cannot use depth so-far because then multiple exit nodes are equally good to "inherit" the chain selection. | |||
107 | int chain_count = 0; | |||
108 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int term __attribute__((unused)) = (visit)->node[_i_ ].term; typeof ((exec_symbol_info)) const node __attribute__( (unused)) = (exec_symbol_info) + idx; { | |||
109 | if (idx >= exec_rnum || CCV_NNC_GRAPH_EXEC_IS_DEAD(node->flags)((node->flags) & CCV_NNC_GRAPH_EXEC_DEAD) || !(maskbit[idx >> 5] & (1u << (idx & 0x1f)))) | |||
110 | continue; | |||
111 | int chain_id = chain_ids[idx]; | |||
112 | if (chain_ids[idx] < 0) | |||
113 | { | |||
114 | chain_id = chain_count; | |||
115 | chain_ids[idx] = chain_id; | |||
116 | chain_pos[idx] = 1; // The first one in this chain. 1-based index because in sparse matrix, 0 is the default value. | |||
117 | chain_count += 1; | |||
118 | } | |||
119 | if (!node->outgoings || node->outgoings->rnum == 0) | |||
120 | continue; | |||
121 | int depth = -1; | |||
122 | int next_idx = -1; | |||
123 | for (i = 0; i < node->outgoings->rnum; i++) | |||
124 | { | |||
125 | const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
126 | if (outgoing >= exec_rnum) | |||
127 | continue; | |||
128 | if (chain_ids[outgoing] < 0 && reversed_depth[outgoing] > depth) | |||
129 | depth = reversed_depth[outgoing], next_idx = outgoing; | |||
130 | } | |||
131 | if (next_idx >= 0) | |||
132 | { | |||
133 | chain_ids[next_idx] = chain_id; | |||
134 | assert(reversed_depth[idx] - depth >= 1)((void) sizeof ((reversed_depth[idx] - depth >= 1) ? 1 : 0 ), __extension__ ({ if (reversed_depth[idx] - depth >= 1) ; else __assert_fail ("reversed_depth[idx] - depth >= 1", "ccv_cnnp_model_gradient_checkpointing.c" , 134, __extension__ __PRETTY_FUNCTION__); })); | |||
135 | chain_pos[next_idx] = chain_pos[idx] + (reversed_depth[idx] - depth); | |||
136 | } | |||
137 | } ccv_nnc_graph_visit_endfor} } | |||
138 | if (exec_rnum < chain_count * 2) // Be more conservative on RAM usage. | |||
139 | buf = ccreallocrealloc(buf, sizeof(int) * chain_count * 2); | |||
140 | ccv_sparse_matrix_t* deps = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, chain_count, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0); | |||
141 | // It logs which pos on that chain we depend on. We can simply compare that with the chain_pos for a node to know if they are ancestors. | |||
142 | #define for_block(x, val) \ | |||
143 | do { \ | |||
144 | if (((int32_t*)val)[0] > 0) \ | |||
145 | { \ | |||
146 | buf[buf_size * 2] = x; \ | |||
147 | buf[buf_size * 2 + 1] = ((int32_t*)val)[0]; \ | |||
148 | ++buf_size; \ | |||
149 | } \ | |||
150 | } while (0) | |||
151 | int buf_size; | |||
152 | ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int term __attribute__((unused)) = (visit)->node[_i_ ].term; typeof ((exec_symbol_info)) const node __attribute__( (unused)) = (exec_symbol_info) + idx; { | |||
153 | if (idx
| |||
154 | continue; | |||
155 | buf_size = 0; /* save all its parent deps to this buffer */ | |||
156 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(deps, idx); | |||
157 | if (vector) | |||
158 | CCV_SPARSE_VECTOR_FOREACH(deps, vector, for_block)do { switch ((((deps)->type) & 0xFF000)) { case CCV_32S : { do { int _i_; __attribute__((unused)) const size_t _c_ = ( ((deps)->type) & 0xFFF); if ((deps)->type & CCV_DENSE_VECTOR ) { for (_i_ = 0; _i_ < (vector)->size; _i_++) { for_block ((_i_), ((vector)->data.i32 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof(ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size [(((deps)->type) & 0xFF000) >> 12] * (((deps)-> type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t *)(vector)->index; for (_i_ = 0; _i_ < (vector)->size ; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t *)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->i), (_d_.i32 + (0))); } } } while (0); break; } case CCV_32F: { do { int _i_; __attribute__((unused )) const size_t _c_ = (((deps)->type) & 0xFFF); if ((deps )->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < ( vector)->size; _i_++) { for_block((_i_), ((vector)->data .f32 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((deps )->type) & 0xFF000) >> 12] * (((deps)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector )->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t *)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->i), (_d_.f32 + (0))); } } } while (0); break; } case CCV_64S: { do { int _i_; __attribute__((unused )) const size_t _c_ = (((deps)->type) & 0xFFF); if ((deps )->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < ( vector)->size; _i_++) { for_block((_i_), ((vector)->data .i64 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((deps )->type) & 0xFF000) >> 12] * (((deps)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector )->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t *)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->i), (_d_.i64 + (0))); } } } while (0); break; } case CCV_64F: { do { int _i_; __attribute__((unused )) const size_t _c_ = (((deps)->type) & 0xFFF); if ((deps )->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < ( vector)->size; _i_++) { for_block((_i_), ((vector)->data .f64 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((deps )->type) & 0xFF000) >> 12] * (((deps)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector )->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t *)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->i), (_d_.f64 + (0))); } } } while (0); break; } default: { do { int _i_; __attribute__((unused )) const size_t _c_ = (((deps)->type) & 0xFFF); if ((deps )->type & CCV_DENSE_VECTOR) { for (_i_ = 0; _i_ < ( vector)->size; _i_++) { for_block((_i_), ((vector)->data .u8 + (_i_ * _c_))); } } else { const size_t _idx_size_ = sizeof (ccv_sparse_matrix_index_t) + ((_ccv_get_data_type_size[(((deps )->type) & 0xFF000) >> 12] * (((deps)->type) & 0xFFF) + 3) & -4); uint8_t* const _vidx_ = (uint8_t*)(vector )->index; for (_i_ = 0; _i_ < (vector)->size; _i_++) { ccv_sparse_matrix_index_t* const _idx_i_ = (ccv_sparse_matrix_index_t *)(_vidx_ + _idx_size_ * _i_); if (_idx_i_->ifbit <= 1) continue; ccv_numeric_data_t _d_ = { .u8 = (uint8_t*)(_idx_i_ + 1) }; for_block((_idx_i_->i), (_d_.u8 + (0))); } } } while (0); } } } while (0); | |||
| ||||
159 | if (!node->outgoings) | |||
160 | continue; | |||
161 | const int chain_id = chain_ids[idx]; | |||
162 | const int pos = chain_pos[idx]; | |||
163 | for (i = 0; i < node->outgoings->rnum; i++) | |||
164 | { | |||
165 | const int outgoing = *(int*)ccv_array_get(node->outgoings, i)((void*)(((char*)((node->outgoings)->data)) + (size_t)( node->outgoings)->rsize * (size_t)(i))); | |||
166 | if (outgoing >= exec_rnum) | |||
167 | continue; | |||
168 | const int outgoing_chain_id = chain_ids[outgoing]; | |||
169 | if (outgoing_chain_id != chain_id) | |||
170 | { | |||
171 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(deps, outgoing, chain_id); | |||
172 | /* If not found, set, if the current node is the destination node, no need | |||
173 | * set itself as parent of subsequent nodes because its terminal nature. */ | |||
174 | if (!cell.i32 || cell.i32[0] == 0 || cell.i32[0] < pos) | |||
175 | ccv_set_sparse_matrix_cell(deps, outgoing, chain_id, &pos); | |||
176 | } | |||
177 | if (buf_size > 0) | |||
178 | { | |||
179 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(deps, outgoing); | |||
180 | for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */ | |||
181 | { | |||
182 | if (outgoing_chain_id == buf[j * 2]) // We don't need to add as dependency for the same chain. | |||
183 | continue; | |||
184 | if (!vector) | |||
185 | { | |||
186 | ccv_set_sparse_matrix_cell(deps, outgoing, buf[j * 2], &buf[j * 2 + 1]); | |||
187 | vector = ccv_get_sparse_matrix_vector(deps, outgoing); | |||
188 | continue; | |||
189 | } | |||
190 | ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(deps, vector, buf[j * 2]); | |||
191 | /* If not found, set. Otherwise, set to the latest one only if it is later. */ | |||
192 | if (!cell.i32 || cell.i32[0] == 0 || cell.i32[0] <= buf[j * 2 + 1]) | |||
193 | ccv_set_sparse_matrix_cell_from_vector(deps, vector, buf[j * 2], &buf[j * 2 + 1]); | |||
194 | } | |||
195 | } | |||
196 | } | |||
197 | } ccv_nnc_graph_visit_endfor} } | |||
198 | #undef for_block | |||
199 | ccfreefree(buf); | |||
200 | ccv_nnc_exec_dep_t exec_dep = { | |||
201 | .chain_ids = chain_ids, | |||
202 | .chain_pos = chain_pos, | |||
203 | .deps = deps | |||
204 | }; | |||
205 | return exec_dep; | |||
206 | } | |||
207 | ||||
208 | void ccv_cnnp_model_apply_gradient_checkpoints(ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_symbolic_graph_t* const graph) | |||
209 | { | |||
210 | ccv_array_t* const gradient_checkpoints = compiled_data->gradient_checkpoints; | |||
211 | if (!gradient_checkpoints || gradient_checkpoints->rnum == 0) // No saved gradient checkpoints, this is an easy way out. | |||
212 | return; | |||
213 | // Otherwise, for each gradient checkpoint, there are 3 steps: | |||
214 | // 1. Find currently, what execs exists from inputs to outputs. | |||
215 | // 2. Find execs that generates the outputs, and their corresponding backward execs. | |||
216 | // 3. Find all backward execs flow from outputs back to inputs. | |||
217 | // 4. Generate new ops by calling build again with old inputs, record all new tensors / execs. | |||
218 | // 5. Replace inputs in backward execs with the new tensors. | |||
219 | // 6. Hook the execs takes inputs with edge from parents of backward execs in step 2. | |||
220 | // 7. Delete newly generated execs that has no use (i.e. its outputs are not used by backward pass). | |||
221 | // 8. Mark all new execs with DISABLE_OPT to avoid common sub-expression elimination pass. | |||
222 | int i, j, k, l; | |||
223 | ccv_array_t* input_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | |||
224 | ccv_array_t* output_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | |||
225 | ccv_array_t* input_gradient_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | |||
226 | ccv_array_t* output_gradient_execs = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0); | |||
227 | ccv_array_t* visited_backward_execs = ccv_array_new(sizeof(int), 0, 0); | |||
228 | ccv_array_t* replaced_backward_execs = ccv_array_new(sizeof(int), 0, 0); | |||
229 | const int exec_rnum = graph->exec_symbol_info->rnum; | |||
230 | ccv_nnc_graph_exec_symbol_reverse_t* const reversed_nodes = cccalloccalloc(exec_rnum, sizeof(ccv_nnc_graph_exec_symbol_reverse_t)); | |||
231 | for (i = 0; i < exec_rnum; i++) | |||
232 | { | |||
233 | const int* tos = 0; | |||
234 | int to_size = 0; | |||
235 | ccv_nnc_graph_exec_symbol_to(graph, (ccv_nnc_graph_exec_symbol_t){ | |||
236 | .graph = graph, | |||
237 | .d = i | |||
238 | }, &tos, &to_size); | |||
239 | if (tos) | |||
240 | for (j = 0; j < to_size; j++) | |||
241 | { | |||
242 | if (!reversed_nodes[tos[j]].outgoings) | |||
243 | reversed_nodes[tos[j]].outgoings = ccv_array_new(sizeof(int), 1, 0); | |||
244 | ccv_array_add_unique_int(reversed_nodes[tos[j]].outgoings, i); | |||
245 | } | |||
246 | } | |||
247 | uint32_t* const maskbit = cccalloccalloc((exec_rnum + 31) >> 5, sizeof(uint32_t)); | |||
248 | // Temporary for build_data. | |||
249 | ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); | |||
250 | ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0); | |||
251 | ccv_array_t* const parameter_trainables = ccv_array_new(sizeof(int), 0, 0); | |||
252 | ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0); | |||
253 | ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0); | |||
254 | int max_output_size = 0; | |||
255 | for (i = 0; i < gradient_checkpoints->rnum; i++) | |||
256 | { | |||
257 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(gradient_checkpoints, i)((void*)(((char*)((gradient_checkpoints)->data)) + (size_t )(gradient_checkpoints)->rsize * (size_t)(i))); | |||
258 | max_output_size = ccv_max(checkpoint->output_size, max_output_size)({ typeof (checkpoint->output_size) _a = (checkpoint->output_size ); typeof (max_output_size) _b = (max_output_size); (_a > _b ) ? _a : _b; }); | |||
259 | } | |||
260 | ccv_nnc_tensor_symbol_t* max_outputs = ccmallocmalloc(sizeof(ccv_nnc_tensor_symbol_t) * max_output_size); | |||
261 | ccv_array_t* newly_used_outputs = ccv_array_new(sizeof(int), 0, 0); | |||
262 | khash_t(ccv_cnnp_tensor_symbol_set)kh_ccv_cnnp_tensor_symbol_set_t* const parameters_or_internals = kh_init(ccv_cnnp_tensor_symbol_set)kh_init_ccv_cnnp_tensor_symbol_set(); | |||
263 | for (i = 0; i < compiled_data->parameters->rnum; i++) | |||
264 | { | |||
265 | const ccv_nnc_tensor_symbol_t* const symbol = (const ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i)((void*)(((char*)((compiled_data->parameters)->data)) + (size_t)(compiled_data->parameters)->rsize * (size_t)( i))); | |||
266 | int ret; | |||
267 | kh_put(ccv_cnnp_tensor_symbol_set, parameters_or_internals, symbol->d, &ret)kh_put_ccv_cnnp_tensor_symbol_set(parameters_or_internals, symbol ->d, &ret); | |||
268 | } | |||
269 | for (i = 0; i < compiled_data->internals->rnum; i++) | |||
270 | { | |||
271 | const ccv_nnc_tensor_symbol_t* const symbol = (const ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i)((void*)(((char*)((compiled_data->parameters)->data)) + (size_t)(compiled_data->parameters)->rsize * (size_t)( i))); | |||
272 | int ret; | |||
273 | kh_put(ccv_cnnp_tensor_symbol_set, parameters_or_internals, symbol->d, &ret)kh_put_ccv_cnnp_tensor_symbol_set(parameters_or_internals, symbol ->d, &ret); | |||
274 | } | |||
275 | khash_t(ccv_cnnp_tensor_symbol_set)kh_ccv_cnnp_tensor_symbol_set_t* const newly_created_tensor_symbols = kh_init(ccv_cnnp_tensor_symbol_set)kh_init_ccv_cnnp_tensor_symbol_set(); | |||
276 | khash_t(ccv_cnnp_tensor_symbol_map)kh_ccv_cnnp_tensor_symbol_map_t* symbol_map = kh_init(ccv_cnnp_tensor_symbol_map)kh_init_ccv_cnnp_tensor_symbol_map(); | |||
277 | for (i = 0; i < gradient_checkpoints->rnum; i++) | |||
278 | { | |||
279 | ccv_cnnp_model_gradient_checkpoint_t* const checkpoint = (ccv_cnnp_model_gradient_checkpoint_t*)ccv_array_get(gradient_checkpoints, i)((void*)(((char*)((gradient_checkpoints)->data)) + (size_t )(gradient_checkpoints)->rsize * (size_t)(i))); | |||
280 | kh_clear(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols)kh_clear_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols ); | |||
281 | for (j = 0; j < checkpoint->tensor_symbols->rnum; j++) | |||
282 | { | |||
283 | const int idx = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(checkpoint->tensor_symbols, j)((void*)(((char*)((checkpoint->tensor_symbols)->data)) + (size_t)(checkpoint->tensor_symbols)->rsize * (size_t) (j))))->d; | |||
284 | if (idx < 0) | |||
285 | continue; | |||
286 | // Skip parameters or internals. | |||
287 | if (kh_get(ccv_cnnp_tensor_symbol_set, parameters_or_internals, idx)kh_get_ccv_cnnp_tensor_symbol_set(parameters_or_internals, idx ) != kh_end(parameters_or_internals)((parameters_or_internals)->n_buckets)) | |||
288 | continue; | |||
289 | int ret; | |||
290 | kh_put(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols, idx, &ret)kh_put_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols , idx, &ret); | |||
291 | } | |||
292 | ccv_array_clear(input_execs); | |||
293 | ccv_array_clear(output_execs); | |||
294 | ccv_nnc_graph_exec_symbol_info_t* exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | |||
295 | for (j = 0; j < exec_rnum; j++) | |||
296 | { | |||
297 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[j].flags)((exec_info[j].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
298 | continue; | |||
299 | const int* inputs = exec_info[j].inputs; | |||
300 | int input_size = exec_info[j].input_size; | |||
301 | const int* outputs = exec_info[j].outputs; | |||
302 | int output_size = exec_info[j].output_size; | |||
303 | if (input_size == 0 && output_size == 0) | |||
304 | continue; | |||
305 | // Only go through forward pass. | |||
306 | if (ccv_nnc_cmd_is_backward(exec_info[j].cmd)) | |||
307 | continue; | |||
308 | const ccv_nnc_graph_exec_symbol_t symbol = { | |||
309 | .graph = graph, | |||
310 | .d = j | |||
311 | }; | |||
312 | int flag = 0; | |||
313 | for (k = 0; inputs && k < input_size && !flag; k++) | |||
314 | if (inputs[k] >= 0) | |||
315 | for (l = 0; l < checkpoint->input_size && !flag; l++) | |||
316 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | |||
317 | flag = 1; | |||
318 | // Input logic is different from output logic. We need to filter out these exec that contains inputs from within the graph. | |||
319 | for (k = 0; inputs && k < input_size && flag; k++) | |||
320 | if (inputs[k] >= 0 && kh_get(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols, inputs[k])kh_get_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols , inputs[k]) != kh_end(newly_created_tensor_symbols)((newly_created_tensor_symbols)->n_buckets)) | |||
321 | flag = 0; | |||
322 | if (flag) | |||
323 | ccv_array_push(input_execs, &symbol); | |||
324 | flag = 0; | |||
325 | for (k = 0; outputs && k < output_size && !flag; k++) | |||
326 | if (outputs[k] >= 0) | |||
327 | for (l = 0; l < checkpoint->output_size && !flag; l++) | |||
328 | if (checkpoint->outputs[l].d >= 0 && outputs[k] == checkpoint->outputs[l].d) | |||
329 | flag = 1; | |||
330 | if (flag) | |||
331 | ccv_array_push(output_execs, &symbol); | |||
332 | } | |||
333 | if (input_execs->rnum <= 0 || output_execs->rnum <= 0) | |||
334 | continue; | |||
335 | // Fill in blanks (i.e. the backward ops that are not showing in above, but should be included to avoid excluding necessary ones). This is done by flowing gradients from outputs back all the way to inputs. | |||
336 | ccv_array_clear(input_gradient_execs); | |||
337 | ccv_array_clear(output_gradient_execs); | |||
338 | for (j = 0; j < input_execs->rnum; j++) | |||
339 | { | |||
340 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_execs, j)((void*)(((char*)((input_execs)->data)) + (size_t)(input_execs )->rsize * (size_t)(j))))->d; | |||
341 | for (k = 0; k < exec_info[d].input_size; k++) | |||
342 | if (exec_info[d].inputs[k] >= 0) | |||
343 | { | |||
344 | const ccv_nnc_tensor_symbol_t gradient_symbol = ccv_nnc_tensor_symbol_for_backward(graph, (ccv_nnc_tensor_symbol_t){ | |||
345 | .graph = graph, | |||
346 | .d = exec_info[d].inputs[k] | |||
347 | }); | |||
348 | if (gradient_symbol.d < 0) | |||
349 | continue; | |||
350 | const ccv_nnc_graph_exec_symbol_t backward = ccv_nnc_graph_exec_symbol_for_backward(graph, gradient_symbol); | |||
351 | if (backward.d < 0) | |||
352 | continue; | |||
353 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[backward.d].flags)((exec_info[backward.d].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
354 | continue; | |||
355 | int flag = 0; | |||
356 | for (l = 0; !flag && l < output_gradient_execs->rnum; l++) | |||
357 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, l)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(l))))->d == backward.d) | |||
358 | flag = 1; | |||
359 | if (!flag) | |||
360 | ccv_array_push(output_gradient_execs, &backward); | |||
361 | } | |||
362 | if (exec_info[d].outgoings && exec_info[d].outgoings->rnum > 0) | |||
363 | for (k = 0; k < exec_info[d].outgoings->rnum; k++) | |||
364 | { | |||
365 | const int to_d = *(int*)ccv_array_get(exec_info[d].outgoings, k)((void*)(((char*)((exec_info[d].outgoings)->data)) + (size_t )(exec_info[d].outgoings)->rsize * (size_t)(k))); | |||
366 | if (!ccv_nnc_cmd_is_backward(exec_info[to_d].cmd)) | |||
367 | continue; | |||
368 | int flag = 0; | |||
369 | for (l = 0; !flag && l < output_gradient_execs->rnum; l++) | |||
370 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, l)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(l))))->d == to_d) | |||
371 | flag = 1; | |||
372 | if (!flag) | |||
373 | { | |||
374 | const ccv_nnc_graph_exec_symbol_t backward = { | |||
375 | .graph = graph, | |||
376 | .d = to_d | |||
377 | }; | |||
378 | ccv_array_push(output_gradient_execs, &backward); | |||
379 | } | |||
380 | } | |||
381 | } | |||
382 | // For output_gradient_execs, we can be opportunistic and use the wrt symbols (if exists) to find relevant bits. | |||
383 | // For input_gradient_execs, there is no other way but to loop over all outgoings, find the ones are direct link as backward execs. | |||
384 | for (j = 0; j < output_execs->rnum; j++) | |||
385 | { | |||
386 | const int d = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_execs, j)((void*)(((char*)((output_execs)->data)) + (size_t)(output_execs )->rsize * (size_t)(j))))->d; | |||
387 | if (exec_info[d].outgoings && exec_info[d].outgoings->rnum > 0) | |||
388 | for (k = 0; k < exec_info[d].outgoings->rnum; k++) | |||
389 | { | |||
390 | const int to_d = *(int*)ccv_array_get(exec_info[d].outgoings, k)((void*)(((char*)((exec_info[d].outgoings)->data)) + (size_t )(exec_info[d].outgoings)->rsize * (size_t)(k))); | |||
391 | if (!ccv_nnc_cmd_is_backward(exec_info[to_d].cmd)) | |||
392 | continue; | |||
393 | int flag = 0; | |||
394 | for (l = 0; !flag && l < input_gradient_execs->rnum; l++) | |||
395 | if (((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, l)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(l))))->d == to_d) | |||
396 | flag = 1; | |||
397 | if (!flag) | |||
398 | { | |||
399 | const ccv_nnc_graph_exec_symbol_t backward = { | |||
400 | .graph = graph, | |||
401 | .d = to_d | |||
402 | }; | |||
403 | ccv_array_push(input_gradient_execs, &backward); | |||
404 | } | |||
405 | } | |||
406 | } | |||
407 | // Note that we have to use up-to-date ones because the exec_info might have outgoings that is up-to-date. | |||
408 | ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, exec_info, graph->exec_symbol_info->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, 0), input_gradient_execs->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, 0), output_gradient_execs->rnum, 1)({ ccv_nnc_graph_visit_t* _visit_ = (ccv_nnc_graph_visit_t*)malloc (sizeof(ccv_nnc_graph_visit_t) + sizeof(_visit_->node[0]) * ((graph->exec_symbol_info->rnum) - 1)); _visit_->size = 0; do { typedef struct { int8_t d; int8_t r; uint16_t c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (graph->exec_symbol_info-> rnum); _i_++) _incoming_edges_ += ((exec_info)[_i_].outgoings ) ? (exec_info)[_i_].outgoings->rnum : 0; const int _heap_mem_ = ((graph->exec_symbol_info->rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof(ccv_nnc_incoming_t) * ( graph->exec_symbol_info->rnum) + sizeof(int32_t) * ((graph ->exec_symbol_info->rnum) * 2 + _incoming_edges_)); else _incomings_ = (ccv_nnc_incoming_t*)__builtin_alloca (sizeof( ccv_nnc_incoming_t) * (graph->exec_symbol_info->rnum) + sizeof(int32_t) * ((graph->exec_symbol_info->rnum) * 2 + _incoming_edges_)); memset(_incomings_, 0, sizeof(ccv_nnc_incoming_t ) * (graph->exec_symbol_info->rnum)); int32_t* _exists_ [2] = { (int32_t*)(_incomings_ + (graph->exec_symbol_info-> rnum)), (int32_t*)(_incomings_ + (graph->exec_symbol_info-> rnum)) + (graph->exec_symbol_info->rnum), }; int32_t* const _edges_ = _exists_[1] + (graph->exec_symbol_info->rnum ); for (_i_ = 0; _i_ < (input_gradient_execs->rnum); _i_ ++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs )->data)) + (size_t)(input_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = { (input_gradient_execs->rnum), 0, }; int _p_ = 0, _q_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue ; _incomings_[_idx_].r = 2; if ((exec_info)[_idx_].outgoings) for (_j_ = 0; _j_ < (exec_info)[_idx_].outgoings->rnum ; _j_++) { const int d = *(int*)((void*)(((char*)(((exec_info )[_idx_].outgoings)->data)) + (size_t)((exec_info)[_idx_]. outgoings)->rsize * (size_t)(_j_))); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r = 1; ((void ) sizeof ((_exist_size_[_q_] < (graph->exec_symbol_info ->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph->exec_symbol_info->rnum)) ; else __assert_fail ( "_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (input_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0 ; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 3) continue ; _incomings_[_idx_].r = 4; if ((exec_info)[_idx_].outgoings) for (_j_ = 0; _j_ < (exec_info)[_idx_].outgoings->rnum ; _j_++) { const int d = *(int*)((void*)(((char*)(((exec_info )[_idx_].outgoings)->data)) + (size_t)((exec_info)[_idx_]. outgoings)->rsize * (size_t)(_j_))); if (_incomings_[d].edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_ [d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_ [d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof (( _exist_size_[_q_] < (graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (output_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_ [_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_ ][_i_]; if (_incomings_[_idx_].r != 5) continue; _incomings_[ _idx_].r = 6; if (_incomings_[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_].c; _j_++) { const int d = _edges_ [_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d].r = 5; ((void) sizeof ((_exist_size_ [_q_] < (graph->exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (graph->exec_symbol_info-> rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d = 1; } for (_i_ = 0; _i_ < (input_gradient_execs->rnum ); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)( (void*)(((char*)((input_gradient_execs)->data)) + (size_t) (input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (input_gradient_execs->rnum ); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; _visit_-> node[_visit_->size].index = ((_idx_)); _visit_->node[_visit_ ->size].term = ((_incomings_[_idx_].d)); ++_visit_->size ;; if (_incomings_[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7 ; } if ((exec_info)[_idx_].outgoings) { if ((exec_info)[_idx_ ].outgoings->rnum == 1) { const int d = *(int*)((void*)((( char*)(((exec_info)[_idx_].outgoings)->data)) + (size_t)(( exec_info)[_idx_].outgoings)->rsize * (size_t)(0))); --_incomings_ [d].c; if (_incomings_[d].c == 0 && _incomings_[d].r == 6 && _d_ < (output_gradient_execs->rnum)) { _exists_ [_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < (exec_info )[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)(( void*)(((char*)(((exec_info)[_idx_].outgoings)->data)) + ( size_t)((exec_info)[_idx_].outgoings)->rsize * (size_t)(_j_ ))); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_[d].r == 6 && _d_ < (output_gradient_execs ->rnum)) { ((void) sizeof ((_exist_size_[_q_] < (graph-> exec_symbol_info->rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (graph->exec_symbol_info->rnum)) ; else __assert_fail ("_exist_size_[_q_] < (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r == 7) continue; if (!(1)) { ((void) sizeof ((_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c > 0) continue; _visit_->node[_visit_->size].index = ((((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].d)); _visit_->node[_visit_->size].term = ( (_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*) ((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d].d)); ++_visit_->size; ; } if (_heap_mem_) free(_incomings_); } while (0);; ((void) sizeof ((_visit_->size <= (graph->exec_symbol_info->rnum )) ? 1 : 0), __extension__ ({ if (_visit_->size <= (graph ->exec_symbol_info->rnum)) ; else __assert_fail ("_visit_->size <= (graph->exec_symbol_info->rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 408, __extension__ __PRETTY_FUNCTION__); })); _visit_; }); | |||
409 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_info)) const node __attribute__ ((unused)) = (exec_info) + idx; { | |||
410 | if (idx < exec_rnum && !CCV_NNC_GRAPH_EXEC_IS_DEAD(node->flags)((node->flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
411 | maskbit[idx >> 5] |= (1u << (idx & 0x1f)); | |||
412 | } ccv_nnc_graph_visit_endfor} } | |||
413 | ccv_array_clear(visited_backward_execs); | |||
414 | // Add more backward pass to the list. Note that we don't add everything, particularly there are new nodes created through gradient checkpointing are ignored. | |||
415 | #define visitor(node, idx, _) \ | |||
416 | if (idx < exec_rnum && !CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD) && maskbit[idx >> 5] & (1u << (idx & 0x1f))) \ | |||
417 | ccv_array_add_unique_int(visited_backward_execs, idx); | |||
418 | CCV_NNC_GRAPH_VISIT(graph, reversed_nodes, exec_rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(output_gradient_execs, 0), output_gradient_execs->rnum, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, 0), input_gradient_execs->rnum, 0, visitor)do { typedef struct { int8_t d; int8_t r; uint16_t c; int32_t edges; } ccv_nnc_incoming_t; int _i_, _j_; int _incoming_edges_ = 0; for (_i_ = 0; _i_ < (exec_rnum); _i_++) _incoming_edges_ += ((reversed_nodes)[_i_].outgoings) ? (reversed_nodes)[_i_] .outgoings->rnum : 0; const int _heap_mem_ = ((exec_rnum) + _incoming_edges_ > 1024); ccv_nnc_incoming_t* _incomings_ ; if (_heap_mem_) _incomings_ = (ccv_nnc_incoming_t*)malloc(sizeof (ccv_nnc_incoming_t) * (exec_rnum) + sizeof(int32_t) * ((exec_rnum ) * 2 + _incoming_edges_)); else _incomings_ = (ccv_nnc_incoming_t *)__builtin_alloca (sizeof(ccv_nnc_incoming_t) * (exec_rnum) + sizeof(int32_t) * ((exec_rnum) * 2 + _incoming_edges_)); memset (_incomings_, 0, sizeof(ccv_nnc_incoming_t) * (exec_rnum)); int32_t * _exists_[2] = { (int32_t*)(_incomings_ + (exec_rnum)), (int32_t *)(_incomings_ + (exec_rnum)) + (exec_rnum), }; int32_t* const _edges_ = _exists_[1] + (exec_rnum); for (_i_ = 0; _i_ < ( output_gradient_execs->rnum); _i_++) { ((void) sizeof (((( ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (( (ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs )->data)) + (size_t)(output_gradient_execs)->rsize * (size_t )(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 1; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } int _exist_size_[2] = { (output_gradient_execs->rnum), 0, }; int _p_ = 0, _q_ = 1 ; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 1) continue ; _incomings_[_idx_].r = 2; if ((reversed_nodes)[_idx_].outgoings ) for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings-> rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes )[_idx_].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_ ].outgoings)->rsize * (size_t)(_j_))); ++_incomings_[d].c; if (_incomings_[d].r != 0) continue; _incomings_[d].r = 1; ( (void) sizeof ((_exist_size_[_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_[_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)", "ccv_cnnp_model_gradient_checkpointing.c" , 418, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_] [_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_ ), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (output_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 3; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void *)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (output_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; int _bump_ = 1; while (_exist_size_[_p_] > 0) { _exist_size_[_q_] = 0 ; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_][_i_]; if (_incomings_[_idx_].r != 3) continue ; _incomings_[_idx_].r = 4; if ((reversed_nodes)[_idx_].outgoings ) for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings-> rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes )[_idx_].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_ ].outgoings)->rsize * (size_t)(_j_))); if (_incomings_[d]. edges == 0) { _incomings_[d].edges = _bump_; _bump_ += _incomings_ [d].c; _incomings_[d].c = 0; } _edges_[_incomings_[d].edges - 1 + _incomings_[d].c] = _idx_; ++_incomings_[d].c; if (_incomings_ [d].r != 2) continue; _incomings_[d].r = 3; ((void) sizeof (( _exist_size_[_q_] < (exec_rnum)) ? 1 : 0), __extension__ ( { if (_exist_size_[_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)", "ccv_cnnp_model_gradient_checkpointing.c" , 418, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_] [_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_ ), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r = 5; _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t*)((void* )(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs )->rsize * (size_t)(0))))[_i_].d; } _exist_size_[0] = (input_gradient_execs ->rnum); _exist_size_[1] = 0; _p_ = 0, _q_ = 1; while (_exist_size_ [_p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_[_p_]; _i_++) { const int32_t _idx_ = _exists_[_p_ ][_i_]; if (_incomings_[_idx_].r != 5) continue; _incomings_[ _idx_].r = 6; if (_incomings_[_idx_].edges > 0) for (_j_ = 0; _j_ < _incomings_[_idx_].c; _j_++) { const int d = _edges_ [_incomings_[_idx_].edges - 1 + _j_]; if (_incomings_[d].r != 4) continue; _incomings_[d].r = 5; ((void) sizeof ((_exist_size_ [_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } ((_i_) = (_p_), (_p_) = (_q_), ( _q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d = 1; } for (_i_ = 0; _i_ < (output_gradient_execs->rnum) ; _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t*)(( void*)(((char*)((output_gradient_execs)->data)) + (size_t) (output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((output_gradient_execs)->data)) + (size_t)(output_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _exists_[0][_i_] = ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((output_gradient_execs)->data)) + (size_t )(output_gradient_execs)->rsize * (size_t)(0))))[_i_].d; } _p_ = 0; _q_ = 1; _exist_size_[0] = (output_gradient_execs-> rnum); _exist_size_[1] = 0; int _d_ = 0; while (_exist_size_[ _p_] > 0) { _exist_size_[_q_] = 0; for (_i_ = 0; _i_ < _exist_size_ [_p_];) { const int32_t _idx_ = _exists_[_p_][_i_]; visitor(( (reversed_nodes) + _idx_), (_idx_), (_incomings_[_idx_].d)); if (_incomings_[_idx_].d) { ++_d_; _incomings_[_idx_].r = 7; } if ((reversed_nodes)[_idx_].outgoings) { if ((reversed_nodes)[_idx_ ].outgoings->rnum == 1) { const int d = *(int*)((void*)((( char*)(((reversed_nodes)[_idx_].outgoings)->data)) + (size_t )((reversed_nodes)[_idx_].outgoings)->rsize * (size_t)(0)) ); --_incomings_[d].c; if (_incomings_[d].c == 0 && _incomings_ [d].r == 6 && _d_ < (input_gradient_execs->rnum )) { _exists_[_p_][_i_] = d; continue; } } else for (_j_ = 0; _j_ < (reversed_nodes)[_idx_].outgoings->rnum; _j_++) { const int d = *(int*)((void*)(((char*)(((reversed_nodes)[_idx_ ].outgoings)->data)) + (size_t)((reversed_nodes)[_idx_].outgoings )->rsize * (size_t)(_j_))); --_incomings_[d].c; if (_incomings_ [d].c == 0 && _incomings_[d].r == 6 && _d_ < (input_gradient_execs->rnum)) { ((void) sizeof ((_exist_size_ [_q_] < (exec_rnum)) ? 1 : 0), __extension__ ({ if (_exist_size_ [_q_] < (exec_rnum)) ; else __assert_fail ("_exist_size_[_q_] < (exec_rnum)" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); _exists_[_q_][_exist_size_[_q_]] = d; ++_exist_size_[_q_]; } } } ++_i_; } ((_i_) = (_p_), (_p_) = (_q_), (_q_) = (_i_)); } for (_i_ = 0; _i_ < (input_gradient_execs ->rnum); _i_++) { ((void) sizeof ((((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ? 1 : 0), __extension__ ({ if (((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph) ; else __assert_fail ("((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].graph == graph" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].r == 7) continue; if (!(0)) { ((void) sizeof ((_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ? 1 : 0), __extension__ ({ if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0) ; else __assert_fail ("_incomings_[((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs)->data)) + (size_t)(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c == 0" , "ccv_cnnp_model_gradient_checkpointing.c", 418, __extension__ __PRETTY_FUNCTION__); })); } else if (_incomings_[((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].c > 0) continue; visitor(((reversed_nodes) + ((ccv_nnc_graph_exec_symbol_t *)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d), ( ((ccv_nnc_graph_exec_symbol_t*)((void*)(((char*)((input_gradient_execs )->data)) + (size_t)(input_gradient_execs)->rsize * (size_t )(0))))[_i_].d), (_incomings_[((ccv_nnc_graph_exec_symbol_t*) ((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(0))))[_i_].d].d) ); } if (_heap_mem_) free(_incomings_); } while (0);; | |||
419 | for (j = 0; j < input_gradient_execs->rnum; j++) | |||
420 | ccv_array_add_unique_int(visited_backward_execs, ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(input_gradient_execs, j)((void*)(((char*)((input_gradient_execs)->data)) + (size_t )(input_gradient_execs)->rsize * (size_t)(j))))->d); | |||
421 | #undef visitor | |||
422 | ccv_cnnp_gradient_checkpoint_build_t build = { | |||
423 | .tensor_context = { | |||
424 | .record = 1, | |||
425 | .tensor_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0), | |||
426 | }, | |||
427 | .graph_exec_symbols = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 0, 0), | |||
428 | .all_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0), | |||
429 | }; | |||
430 | build.tensor_context.old_tensor_symbol_new_hook_context = ccv_nnc_tensor_symbol_new_hook(graph, _ccv_cnnp_gradient_checkpoint_tensor_symbol_new_hook, &build, &build.tensor_context.old_tensor_symbol_new_hook); | |||
431 | build.tensor_context.old_tensor_symbol_alias_new_hook_context = ccv_nnc_tensor_symbol_alias_new_hook(graph, _ccv_cnnp_gradient_checkpoint_tensor_symbol_alias_new_hook, &build, &build.tensor_context.old_tensor_symbol_alias_new_hook); | |||
432 | build.old_graph_exec_symbol_new_hook_context = ccv_nnc_graph_exec_symbol_new_hook(graph, _ccv_cnnp_model_gradient_checkpoint_graph_exec_symbol_new_hook, &build, &build.old_graph_exec_symbol_new_hook); | |||
433 | ccv_array_clear(parameters); | |||
434 | ccv_array_clear(parameter_ids); | |||
435 | ccv_array_clear(parameter_trainables); | |||
436 | ccv_array_clear(internals); | |||
437 | ccv_array_clear(internal_ids); | |||
438 | ccv_cnnp_model_sequence_t model_sequence = { | |||
439 | .bank = kh_init(ccv_cnnp_model_name_bank)kh_init_ccv_cnnp_model_name_bank() | |||
440 | }; | |||
441 | ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = { | |||
442 | .add_parameter_indices = 0, | |||
443 | .prefix = 't', | |||
444 | .sequence = &model_sequence, | |||
445 | .symbols = parameters, | |||
446 | .ids = parameter_ids, | |||
447 | .trainables = parameter_trainables, | |||
448 | }; | |||
449 | ccv_cnnp_model_add_to_array_context_t add_to_output_context = { | |||
450 | .add_parameter_indices = 0, | |||
451 | .prefix = 'r', | |||
452 | .sequence = &model_sequence, | |||
453 | .symbols = internals, | |||
454 | .ids = internal_ids, | |||
455 | .trainables = 0, | |||
456 | }; | |||
457 | ccv_cnnp_model_build_data_t build_data = { | |||
458 | .is_trainable = checkpoint->is_trainable, | |||
459 | .model_sequence = &model_sequence, | |||
460 | .add_to_array = ccv_cnnp_model_add_to_array, | |||
461 | .parameters = parameters, | |||
462 | .context = { | |||
463 | .add_to_parameter = &add_to_parameter_context, | |||
464 | .add_to_output = &add_to_output_context, | |||
465 | }, | |||
466 | .is_gradient_checkpointing = 1, // Mark this as true so we don't allocate gradient_checkpoints array or override the hooks. | |||
467 | .gradient_checkpoints = 0, | |||
468 | }; | |||
469 | checkpoint->model->data = &build_data; | |||
470 | checkpoint->build(checkpoint->model, graph, checkpoint->inputs, checkpoint->input_size, max_outputs, checkpoint->output_size); | |||
471 | checkpoint->model->data = 0; | |||
472 | kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank)kh_destroy_ccv_cnnp_model_name_bank(model_sequence.bank); | |||
473 | if (model_sequence.sequences) | |||
474 | ccv_array_free(model_sequence.sequences); | |||
475 | ccv_nnc_tensor_symbol_new_hook(graph, build.tensor_context.old_tensor_symbol_new_hook, build.tensor_context.old_tensor_symbol_new_hook_context, 0); | |||
476 | ccv_nnc_tensor_symbol_alias_new_hook(graph, build.tensor_context.old_tensor_symbol_alias_new_hook, build.tensor_context.old_tensor_symbol_alias_new_hook_context, 0); | |||
477 | ccv_nnc_graph_exec_symbol_autogen(graph, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, 0)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(0))), build.graph_exec_symbols->rnum, 0); | |||
478 | for (j = 0; j < parameter_ids->rnum; j++) | |||
479 | ccfreefree(*(char**)ccv_array_get(parameter_ids, j)((void*)(((char*)((parameter_ids)->data)) + (size_t)(parameter_ids )->rsize * (size_t)(j)))); | |||
480 | for (j = 0; j < internal_ids->rnum; j++) | |||
481 | ccfreefree(*(char**)ccv_array_get(internal_ids, j)((void*)(((char*)((internal_ids)->data)) + (size_t)(internal_ids )->rsize * (size_t)(j)))); | |||
482 | // Note that there is no graph optimization applied here. | |||
483 | exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | |||
484 | // Reuse existing one. | |||
485 | kh_clear(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols)kh_clear_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols ); | |||
486 | for (j = 0; j < build.tensor_context.tensor_symbols->rnum; j++) | |||
487 | { | |||
488 | const int idx = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(build.tensor_context.tensor_symbols, j)((void*)(((char*)((build.tensor_context.tensor_symbols)->data )) + (size_t)(build.tensor_context.tensor_symbols)->rsize * (size_t)(j))))->d; | |||
489 | if (idx < 0) | |||
490 | continue; | |||
491 | if (kh_get(ccv_cnnp_tensor_symbol_set, parameters_or_internals, idx)kh_get_ccv_cnnp_tensor_symbol_set(parameters_or_internals, idx ) != kh_end(parameters_or_internals)((parameters_or_internals)->n_buckets)) | |||
492 | continue; | |||
493 | int ret; | |||
494 | kh_put(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols, idx, &ret)kh_put_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols , idx, &ret); | |||
495 | } | |||
496 | ccv_array_t* const newly_input_execs = input_execs; | |||
497 | ccv_array_t* const newly_output_execs = output_execs; | |||
498 | ccv_array_clear(newly_input_execs); | |||
499 | ccv_array_clear(newly_output_execs); | |||
500 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
501 | { | |||
502 | const int idx = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))))->d; | |||
503 | if (idx < 0) | |||
504 | continue; | |||
505 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
506 | continue; | |||
507 | const ccv_nnc_graph_exec_symbol_t symbol = { | |||
508 | .graph = graph, | |||
509 | .d = idx | |||
510 | }; | |||
511 | const int* inputs = exec_info[idx].inputs; | |||
512 | int input_size = exec_info[idx].input_size; | |||
513 | // Only go through forward pass. | |||
514 | assert(!ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((!ccv_nnc_cmd_is_backward(exec_info[idx].cmd) ) ? 1 : 0), __extension__ ({ if (!ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 514, __extension__ __PRETTY_FUNCTION__); })); | |||
515 | int flag = 0; | |||
516 | for (k = 0; inputs && k < input_size && !flag; k++) | |||
517 | if (inputs[k] >= 0) | |||
518 | for (l = 0; l < checkpoint->input_size && !flag; l++) | |||
519 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | |||
520 | flag = 1; | |||
521 | // Input logic is different from output logic. We need to filter out these exec that contains inputs from within the graph. | |||
522 | for (k = 0; inputs && k < input_size && flag; k++) | |||
523 | if (inputs[k] >= 0 && kh_get(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols, inputs[k])kh_get_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols , inputs[k]) != kh_end(newly_created_tensor_symbols)((newly_created_tensor_symbols)->n_buckets)) | |||
524 | flag = 0; | |||
525 | if (flag) | |||
526 | ccv_array_push(newly_input_execs, &symbol); | |||
527 | flag = 0; | |||
528 | const int* outputs = exec_info[idx].outputs; | |||
529 | int output_size = exec_info[idx].output_size; | |||
530 | for (k = 0; inputs && k < output_size && !flag; k++) | |||
531 | if (outputs[k] >= 0) | |||
532 | for (l = 0; l < checkpoint->output_size && !flag; l++) | |||
533 | if (max_outputs[l].d >= 0 && outputs[k] == max_outputs[l].d) | |||
534 | flag = 1; | |||
535 | if (flag) | |||
536 | ccv_array_push(newly_output_execs, &symbol); | |||
537 | } | |||
538 | for (j = 0; j < checkpoint->input_size; j++) | |||
539 | if (checkpoint->inputs[j].d >= 0) | |||
540 | ccv_array_push(parameters, checkpoint->inputs + j); | |||
541 | ccv_nnc_symbolic_graph_simplify(graph, | |||
542 | SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1) | |||
543 | CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1) | |||
544 | CCV_NNC_SIMPLIFY_OPS_FUSION)(const int []){CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION , CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT, CCV_NNC_SIMPLIFY_OPS_FUSION }, (1 +1 +1 +1 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 + 0 +0 +0 +0 -1), | |||
545 | ccv_array_get(parameters, 0)((void*)(((char*)((parameters)->data)) + (size_t)(parameters )->rsize * (size_t)(0))), parameters->rnum, | |||
546 | max_outputs, checkpoint->output_size, | |||
547 | ccv_array_get(newly_input_execs, 0)((void*)(((char*)((newly_input_execs)->data)) + (size_t)(newly_input_execs )->rsize * (size_t)(0))), newly_input_execs->rnum, ccv_array_get(newly_output_execs, 0)((void*)(((char*)((newly_output_execs)->data)) + (size_t)( newly_output_execs)->rsize * (size_t)(0))), newly_output_execs->rnum); | |||
548 | ccv_nnc_graph_exec_symbol_new_hook(graph, build.old_graph_exec_symbol_new_hook, build.old_graph_exec_symbol_new_hook_context, 0); | |||
549 | // Need to autogen and redo source / destination. | |||
550 | ccv_nnc_graph_exec_symbol_autogen(graph, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, 0)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(0))), build.graph_exec_symbols->rnum, 0); | |||
551 | ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0)((void*)(((char*)((graph->tensor_symbol_info)->data)) + (size_t)(graph->tensor_symbol_info)->rsize * (size_t)( 0))); | |||
552 | exec_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0)((void*)(((char*)((graph->exec_symbol_info)->data)) + ( size_t)(graph->exec_symbol_info)->rsize * (size_t)(0))); | |||
553 | ccv_array_clear(newly_input_execs); | |||
554 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
555 | { | |||
556 | const int idx = ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))))->d; | |||
557 | if (idx < 0) | |||
558 | continue; | |||
559 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
560 | continue; | |||
561 | const ccv_nnc_graph_exec_symbol_t symbol = { | |||
562 | .graph = graph, | |||
563 | .d = idx | |||
564 | }; | |||
565 | const int* inputs = exec_info[idx].inputs; | |||
566 | int input_size = exec_info[idx].input_size; | |||
567 | // Only go through forward pass. | |||
568 | assert(!ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((!ccv_nnc_cmd_is_backward(exec_info[idx].cmd) ) ? 1 : 0), __extension__ ({ if (!ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 568, __extension__ __PRETTY_FUNCTION__); })); | |||
569 | int flag = 0; | |||
570 | for (k = 0; inputs && k < input_size && !flag; k++) | |||
571 | if (inputs[k] >= 0) | |||
572 | for (l = 0; l < checkpoint->input_size && !flag; l++) | |||
573 | if (checkpoint->inputs[l].d >= 0 && inputs[k] == checkpoint->inputs[l].d) | |||
574 | flag = 1; | |||
575 | for (k = 0; inputs && k < input_size && flag; k++) | |||
576 | if (inputs[k] >= 0 && kh_get(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols, inputs[k])kh_get_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols , inputs[k]) != kh_end(newly_created_tensor_symbols)((newly_created_tensor_symbols)->n_buckets)) | |||
577 | flag = 0; | |||
578 | if (flag) | |||
579 | ccv_array_push(newly_input_execs, &symbol); | |||
580 | } | |||
581 | // Build a map between old tensor symbols and new tensor symbols. | |||
582 | assert(build.tensor_context.tensor_symbols->rnum <= checkpoint->tensor_symbols->rnum)((void) sizeof ((build.tensor_context.tensor_symbols->rnum <= checkpoint->tensor_symbols->rnum) ? 1 : 0), __extension__ ({ if (build.tensor_context.tensor_symbols->rnum <= checkpoint ->tensor_symbols->rnum) ; else __assert_fail ("build.tensor_context.tensor_symbols->rnum <= checkpoint->tensor_symbols->rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 582, __extension__ __PRETTY_FUNCTION__); })); | |||
583 | // Build a map to potentially map from old input to new input. | |||
584 | kh_clear(ccv_cnnp_tensor_symbol_map, symbol_map)kh_clear_ccv_cnnp_tensor_symbol_map(symbol_map); | |||
585 | for (j = 0, k = 0; j < build.tensor_context.tensor_symbols->rnum && k < checkpoint->tensor_symbols->rnum;) | |||
586 | { | |||
587 | const int from_d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(checkpoint->tensor_symbols, k)((void*)(((char*)((checkpoint->tensor_symbols)->data)) + (size_t)(checkpoint->tensor_symbols)->rsize * (size_t) (k))))->d; | |||
588 | if (from_d < 0) // This is removed, move to the next one. | |||
589 | { | |||
590 | ++j; | |||
591 | ++k; | |||
592 | continue; | |||
593 | } | |||
594 | const int to_d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(build.tensor_context.tensor_symbols, j)((void*)(((char*)((build.tensor_context.tensor_symbols)->data )) + (size_t)(build.tensor_context.tensor_symbols)->rsize * (size_t)(j))))->d; | |||
595 | assert(to_d >= 0)((void) sizeof ((to_d >= 0) ? 1 : 0), __extension__ ({ if ( to_d >= 0) ; else __assert_fail ("to_d >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 595, __extension__ __PRETTY_FUNCTION__); })); | |||
596 | int from_flag = kh_get(ccv_cnnp_tensor_symbol_set, parameters_or_internals, from_d)kh_get_ccv_cnnp_tensor_symbol_set(parameters_or_internals, from_d ) != kh_end(parameters_or_internals)((parameters_or_internals)->n_buckets); | |||
597 | int to_flag = kh_get(ccv_cnnp_tensor_symbol_set, parameters_or_internals, to_d)kh_get_ccv_cnnp_tensor_symbol_set(parameters_or_internals, to_d ) != kh_end(parameters_or_internals)((parameters_or_internals)->n_buckets); | |||
598 | if (from_flag) | |||
599 | ++k; | |||
600 | if (to_flag) | |||
601 | ++j; | |||
602 | if (from_flag || to_flag) | |||
603 | continue; | |||
604 | ++k; | |||
605 | ++j; | |||
606 | // Skip if from_d is outputs. | |||
607 | for (l = 0; l < !from_flag && checkpoint->output_size; l++) | |||
608 | if (checkpoint->outputs[l].d == from_d) | |||
609 | from_flag = 1; | |||
610 | if (from_flag) | |||
611 | continue; | |||
612 | // Skip if to_d is outputs. | |||
613 | for (l = 0; l < !to_flag && checkpoint->output_size; l++) | |||
614 | if (checkpoint->outputs[l].d == to_d) | |||
615 | to_flag = 1; | |||
616 | if (to_flag) | |||
617 | continue; | |||
618 | int ret = 0; | |||
619 | khiter_t h = kh_put(ccv_cnnp_tensor_symbol_map, symbol_map, from_d, &ret)kh_put_ccv_cnnp_tensor_symbol_map(symbol_map, from_d, &ret ); | |||
620 | kh_val(symbol_map, h)((symbol_map)->vals[h]) = to_d; | |||
621 | } | |||
622 | // Now go over all backward passes to replace inputs with the ones from symbol map. Record these that are used. | |||
623 | ccv_array_clear(newly_used_outputs); | |||
624 | ccv_array_clear(replaced_backward_execs); | |||
625 | for (j = 0; j < visited_backward_execs->rnum; j++) | |||
626 | { | |||
627 | const int idx = *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))); | |||
628 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[idx].flags)((exec_info[idx].flags) & CCV_NNC_GRAPH_EXEC_DEAD)) | |||
629 | continue; | |||
630 | assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if ( idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 630, __extension__ __PRETTY_FUNCTION__); })); | |||
631 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 631, __extension__ __PRETTY_FUNCTION__); })); | |||
632 | if (!ccv_nnc_cmd_is_backward(exec_info[idx].cmd)) | |||
633 | continue; | |||
634 | for (k = 0; k < exec_info[idx].input_size; k++) | |||
635 | if (exec_info[idx].inputs[k] >= 0) | |||
636 | { | |||
637 | const khiter_t h = kh_get(ccv_cnnp_tensor_symbol_map, symbol_map, exec_info[idx].inputs[k])kh_get_ccv_cnnp_tensor_symbol_map(symbol_map, exec_info[idx]. inputs[k]); | |||
638 | if (h != kh_end(symbol_map)((symbol_map)->n_buckets)) // Replacing it. | |||
639 | { | |||
640 | int newly_created_output = kh_val(symbol_map, h)((symbol_map)->vals[h]); | |||
641 | exec_info[idx].inputs[k] = newly_created_output; | |||
642 | ccv_array_add_unique_int(newly_used_outputs, newly_created_output); | |||
643 | if (tensor_symbol_info[newly_created_output].alias_ref > 0) | |||
644 | { | |||
645 | newly_created_output = tensor_symbol_info[newly_created_output].alias_ref - 1; | |||
646 | ccv_array_add_unique_int(newly_used_outputs, newly_created_output); | |||
647 | } | |||
648 | ccv_array_add_unique_int(replaced_backward_execs, idx); | |||
649 | } | |||
650 | } | |||
651 | } | |||
652 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
653 | { | |||
654 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | |||
655 | if (symbol->d < 0) | |||
656 | continue; | |||
657 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
658 | continue; | |||
659 | int x, y; | |||
660 | for (k = 0; k < replaced_backward_execs->rnum; k++) | |||
661 | { | |||
662 | const int idx = *(int*)ccv_array_get(replaced_backward_execs, k)((void*)(((char*)((replaced_backward_execs)->data)) + (size_t )(replaced_backward_execs)->rsize * (size_t)(k))); | |||
663 | assert(idx >= 0)((void) sizeof ((idx >= 0) ? 1 : 0), __extension__ ({ if ( idx >= 0) ; else __assert_fail ("idx >= 0", "ccv_cnnp_model_gradient_checkpointing.c" , 663, __extension__ __PRETTY_FUNCTION__); })); | |||
664 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 664, __extension__ __PRETTY_FUNCTION__); })); | |||
665 | assert(ccv_nnc_cmd_is_backward(exec_info[idx].cmd))((void) sizeof ((ccv_nnc_cmd_is_backward(exec_info[idx].cmd)) ? 1 : 0), __extension__ ({ if (ccv_nnc_cmd_is_backward(exec_info [idx].cmd)) ; else __assert_fail ("ccv_nnc_cmd_is_backward(exec_info[idx].cmd)" , "ccv_cnnp_model_gradient_checkpointing.c", 665, __extension__ __PRETTY_FUNCTION__); })); | |||
666 | int flag = 0; | |||
667 | for (x = 0; !flag && x < exec_info[idx].input_size; x++) | |||
668 | { | |||
669 | int x_d = exec_info[idx].inputs[x]; | |||
670 | if (x_d < 0) | |||
671 | continue; | |||
672 | if (tensor_symbol_info[x_d].alias_ref > 0) | |||
673 | x_d = tensor_symbol_info[x_d].alias_ref - 1; | |||
674 | for (y = 0; !flag && y < exec_info[symbol->d].output_size; y++) | |||
675 | { | |||
676 | int y_d = exec_info[symbol->d].outputs[y]; | |||
677 | if (y_d < 0) | |||
678 | continue; | |||
679 | if (tensor_symbol_info[y_d].alias_ref > 0) | |||
680 | y_d = tensor_symbol_info[y_d].alias_ref - 1; | |||
681 | if (x_d == y_d) | |||
682 | flag = 1; | |||
683 | } | |||
684 | } | |||
685 | if (flag) | |||
686 | ccv_nnc_graph_exec_symbol_concat(graph, *symbol, (ccv_nnc_graph_exec_symbol_t){ | |||
687 | .graph = graph, | |||
688 | .d = idx | |||
689 | }); | |||
690 | } | |||
691 | } | |||
692 | // Find parents to visited_backward_execs, and use that as the starting point of all newly added graph_exec_symbols. Use the visited backward execs as the source, use all its parents as destination, go through with graph visit. | |||
693 | ccv_nnc_exec_dep_t exec_dep = _ccv_nnc_exec_dep_new(graph, visit, exec_rnum, maskbit); | |||
694 | // Now go from outputs to inputs, unmark visited ones. | |||
695 | ccv_nnc_graph_visit_for(visit, exec_info, node, idx){ int _i_; for (_i_ = 0; _i_ < (visit)->size; _i_++) { const int idx __attribute__((unused)) = (visit)->node[_i_].index ; const int _node_unused_ __attribute__((unused)) = (visit)-> node[_i_].term; typeof ((exec_info)) const node __attribute__ ((unused)) = (exec_info) + idx; { | |||
696 | if (idx < exec_rnum) | |||
697 | maskbit[idx >> 5] &= ~(1u << (idx & 0x1f)); | |||
698 | } ccv_nnc_graph_visit_endfor} } | |||
699 | ccv_nnc_graph_visit_free(visit); | |||
700 | // Go through visited backward execs, remove the ones that has no dependency on any replaced backward execs. | |||
701 | for (j = 0; j < visited_backward_execs->rnum;) | |||
702 | { | |||
703 | const int idx = *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))); | |||
704 | if (ccv_array_contain_int(replaced_backward_execs, idx)) | |||
705 | { | |||
706 | ++j; | |||
707 | continue; | |||
708 | } | |||
709 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep.deps, idx); | |||
710 | if (!vector) | |||
711 | vector = (ccv_sparse_matrix_vector_t*)1; // Mark it as we tried but cannot find. | |||
712 | int flag = 0; | |||
713 | for (k = 0; !flag && k < replaced_backward_execs->rnum; k++) | |||
714 | { | |||
715 | const int d = *(int*)ccv_array_get(replaced_backward_execs, k)((void*)(((char*)((replaced_backward_execs)->data)) + (size_t )(replaced_backward_execs)->rsize * (size_t)(k))); | |||
716 | flag = ccv_nnc_exec_dep_check(exec_dep, idx, vector, d); | |||
717 | } | |||
718 | if (!flag) | |||
719 | { | |||
720 | if (j < visited_backward_execs->rnum - 1) | |||
721 | *(int*)ccv_array_get(visited_backward_execs, j)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(j))) = *(int*)ccv_array_get(visited_backward_execs, visited_backward_execs->rnum - 1)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(visited_backward_execs ->rnum - 1))); | |||
722 | --visited_backward_execs->rnum; | |||
723 | continue; | |||
724 | } | |||
725 | ++j; | |||
726 | } | |||
727 | // Now go through all replaced_backward_execs to find the ones has no dependencies in visited_backward_execs. | |||
728 | for (j = 0; j < replaced_backward_execs->rnum; j++) | |||
729 | { | |||
730 | const int idx = *(int*)ccv_array_get(replaced_backward_execs, j)((void*)(((char*)((replaced_backward_execs)->data)) + (size_t )(replaced_backward_execs)->rsize * (size_t)(j))); | |||
731 | ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep.deps, idx); | |||
732 | if (!vector) | |||
733 | vector = (ccv_sparse_matrix_vector_t*)1; // Mark it as we tried but cannot find. | |||
734 | int flag = 0; | |||
735 | for (k = 0; !flag && k < visited_backward_execs->rnum; k++) | |||
736 | { | |||
737 | const int d = *(int*)ccv_array_get(visited_backward_execs, k)((void*)(((char*)((visited_backward_execs)->data)) + (size_t )(visited_backward_execs)->rsize * (size_t)(k))); | |||
738 | flag = ccv_nnc_exec_dep_check(exec_dep, idx, vector, d); | |||
739 | } | |||
740 | // If this one has no parents that is within the visited_backward_execs, it is a good place for us to add all its parents as dependency for input_execs. | |||
741 | if (!flag) | |||
742 | { | |||
743 | assert(idx < exec_rnum)((void) sizeof ((idx < exec_rnum) ? 1 : 0), __extension__ ( { if (idx < exec_rnum) ; else __assert_fail ("idx < exec_rnum" , "ccv_cnnp_model_gradient_checkpointing.c", 743, __extension__ __PRETTY_FUNCTION__); })); | |||
744 | ccv_array_t* const outgoings = reversed_nodes[idx].outgoings; | |||
745 | assert(outgoings)((void) sizeof ((outgoings) ? 1 : 0), __extension__ ({ if (outgoings ) ; else __assert_fail ("outgoings", "ccv_cnnp_model_gradient_checkpointing.c" , 745, __extension__ __PRETTY_FUNCTION__); })); | |||
746 | for (k = 0; k < outgoings->rnum; k++) | |||
747 | { | |||
748 | const int d = *(int*)ccv_array_get(outgoings, k)((void*)(((char*)((outgoings)->data)) + (size_t)(outgoings )->rsize * (size_t)(k))); | |||
749 | for (l = 0; l < newly_input_execs->rnum; l++) | |||
750 | { | |||
751 | ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){ | |||
752 | .graph = graph, | |||
753 | .d = d | |||
754 | }, *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(newly_input_execs, l)((void*)(((char*)((newly_input_execs)->data)) + (size_t)(newly_input_execs )->rsize * (size_t)(l)))); | |||
755 | } | |||
756 | } | |||
757 | } | |||
758 | } | |||
759 | ccv_nnc_exec_dep_free(exec_dep); | |||
760 | // Go through all exec, free ones that doesn't have output used. | |||
761 | // Reuse this array because it is not useful any more. | |||
762 | ccv_array_t* forward_pass_inputs = visited_backward_execs; | |||
763 | int any_deleted; | |||
764 | do { | |||
765 | // Build a map of still active inputs. | |||
766 | ccv_array_clear(forward_pass_inputs); | |||
767 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
768 | { | |||
769 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | |||
770 | if (symbol->d < 0) | |||
771 | continue; | |||
772 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
773 | continue; | |||
774 | int* const inputs = exec_info[symbol->d].inputs; | |||
775 | const int input_size = exec_info[symbol->d].input_size; | |||
776 | for (k = 0; k < input_size; k++) | |||
777 | { | |||
778 | int d = inputs[k]; | |||
779 | if (d < 0) | |||
780 | continue; | |||
781 | ccv_array_add_unique_int(forward_pass_inputs, d); | |||
782 | if (tensor_symbol_info[d].alias_ref > 0) | |||
783 | { | |||
784 | d = tensor_symbol_info[d].alias_ref - 1; | |||
785 | ccv_array_add_unique_int(forward_pass_inputs, d); | |||
786 | } | |||
787 | } | |||
788 | } | |||
789 | any_deleted = 0; | |||
790 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
791 | { | |||
792 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | |||
793 | if (symbol->d < 0) | |||
794 | continue; | |||
795 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
796 | continue; | |||
797 | int* const outputs = exec_info[symbol->d].outputs; | |||
798 | const int output_size = exec_info[symbol->d].output_size; | |||
799 | int flag = 0; | |||
800 | for (k = 0; !flag && k < output_size; k++) | |||
801 | { | |||
802 | int d = outputs[k]; | |||
803 | if (d < 0) | |||
804 | continue; | |||
805 | flag = ccv_array_contain_int(newly_used_outputs, d) || ccv_array_contain_int(forward_pass_inputs, d); | |||
806 | if (!flag && tensor_symbol_info[d].alias_ref > 0) | |||
807 | { | |||
808 | d = tensor_symbol_info[d].alias_ref - 1; | |||
809 | flag = ccv_array_contain_int(newly_used_outputs, d) || ccv_array_contain_int(forward_pass_inputs, d); | |||
810 | } | |||
811 | } | |||
812 | if (flag) | |||
813 | continue; | |||
814 | ccv_nnc_graph_exec_symbol_free(graph, *symbol); | |||
815 | symbol->d = -1; | |||
816 | symbol->graph = 0; | |||
817 | any_deleted = 1; | |||
818 | } | |||
819 | } while (any_deleted); | |||
820 | ccv_array_clear(forward_pass_inputs); | |||
821 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
822 | { | |||
823 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | |||
824 | if (symbol->d < 0) | |||
825 | continue; | |||
826 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
827 | continue; | |||
828 | int* const inputs = exec_info[symbol->d].inputs; | |||
829 | const int input_size = exec_info[symbol->d].input_size; | |||
830 | for (k = 0; k < input_size; k++) | |||
831 | { | |||
832 | if (inputs[k] < 0) | |||
833 | continue; | |||
834 | ccv_array_add_unique_int(forward_pass_inputs, inputs[k]); | |||
835 | if (tensor_symbol_info[inputs[k]].alias_ref > 0) | |||
836 | ccv_array_add_unique_int(forward_pass_inputs, tensor_symbol_info[inputs[k]].alias_ref - 1); | |||
837 | } | |||
838 | int* const outputs = exec_info[symbol->d].outputs; | |||
839 | const int output_size = exec_info[symbol->d].output_size; | |||
840 | for (k = 0; k < output_size; k++) | |||
841 | { | |||
842 | if (outputs[k] < 0) | |||
843 | continue; | |||
844 | ccv_array_add_unique_int(forward_pass_inputs, outputs[k]); | |||
845 | if (tensor_symbol_info[outputs[k]].alias_ref > 0) | |||
846 | ccv_array_add_unique_int(forward_pass_inputs, tensor_symbol_info[outputs[k]].alias_ref - 1); | |||
847 | } | |||
848 | } | |||
849 | // Free unused tensor symbols. | |||
850 | for (j = 0; j < build.all_tensor_symbols->rnum; j++) | |||
851 | { | |||
852 | const ccv_nnc_tensor_symbol_t* symbol = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(build.all_tensor_symbols, j)((void*)(((char*)((build.all_tensor_symbols)->data)) + (size_t )(build.all_tensor_symbols)->rsize * (size_t)(j)))); | |||
853 | if (ccv_array_contain_int(newly_used_outputs, symbol->d) || ccv_array_contain_int(forward_pass_inputs, symbol->d)) | |||
854 | continue; | |||
855 | if (tensor_symbol_info[symbol->d].alias_ref > 0) | |||
856 | { | |||
857 | const int d = tensor_symbol_info[symbol->d].alias_ref - 1; | |||
858 | if (ccv_array_contain_int(newly_used_outputs, d) || ccv_array_contain_int(forward_pass_inputs, d)) | |||
859 | continue; | |||
860 | } | |||
861 | ccv_nnc_tensor_symbol_free(graph, *symbol); | |||
862 | } | |||
863 | for (j = 0; j < build.graph_exec_symbols->rnum; j++) | |||
864 | { | |||
865 | ccv_nnc_graph_exec_symbol_t* const symbol = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(build.graph_exec_symbols, j)((void*)(((char*)((build.graph_exec_symbols)->data)) + (size_t )(build.graph_exec_symbols)->rsize * (size_t)(j))); | |||
866 | if (symbol->d < 0) | |||
867 | continue; | |||
868 | if (CCV_NNC_GRAPH_EXEC_IS_DEAD(exec_info[symbol->d].flags)((exec_info[symbol->d].flags) & CCV_NNC_GRAPH_EXEC_DEAD )) | |||
869 | continue; | |||
870 | ccv_nnc_graph_exec_symbol_set_flags(graph, *symbol, CCV_NNC_GRAPH_EXEC_DISABLE_OPT); | |||
871 | } | |||
872 | // Free these newly created execs and tensor symbols. | |||
873 | ccv_array_free(build.tensor_context.tensor_symbols); | |||
874 | ccv_array_free(build.graph_exec_symbols); | |||
875 | ccv_array_free(build.all_tensor_symbols); | |||
876 | } | |||
877 | kh_destroy(ccv_cnnp_tensor_symbol_map, symbol_map)kh_destroy_ccv_cnnp_tensor_symbol_map(symbol_map); | |||
878 | kh_destroy(ccv_cnnp_tensor_symbol_set, newly_created_tensor_symbols)kh_destroy_ccv_cnnp_tensor_symbol_set(newly_created_tensor_symbols ); | |||
879 | kh_destroy(ccv_cnnp_tensor_symbol_set, parameters_or_internals)kh_destroy_ccv_cnnp_tensor_symbol_set(parameters_or_internals ); | |||
880 | ccfreefree(max_outputs); | |||
881 | ccv_array_free(newly_used_outputs); | |||
882 | ccv_array_free(parameters); | |||
883 | ccv_array_free(parameter_ids); | |||
884 | ccv_array_free(parameter_trainables); | |||
885 | ccv_array_free(internals); | |||
886 | ccv_array_free(internal_ids); | |||
887 | ccfreefree(maskbit); | |||
888 | ccv_array_free(input_gradient_execs); | |||
889 | ccv_array_free(output_gradient_execs); | |||
890 | ccv_array_free(input_execs); | |||
891 | ccv_array_free(output_execs); | |||
892 | ccv_array_free(replaced_backward_execs); | |||
893 | ccv_array_free(visited_backward_execs); | |||
894 | for (i = 0; i < exec_rnum; i++) | |||
895 | if (reversed_nodes[i].outgoings) | |||
896 | ccv_array_free(reversed_nodes[i].outgoings); | |||
897 | ccfreefree(reversed_nodes); | |||
898 | } |