/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_memory_compression.c
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3.5 API

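// tensor_marked / tensor_used are bitsets: tensor symbol d maps to bit (d & 0x1f) of
// 32-bit word (d >> 5), so e.g. tensor 37 lives at bit 5 of word 1. The size argument
// here counts words, not tensors. This helper keeps a mark only if the same bit is
// also set in tensor_used, i.e. marked &= used, word by word.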
static void _ccv_nnc_remove_unused_from_marked(const uint32_t* const tensor_used, const int size, uint32_t* const tensor_marked)
{
	int i;
	for (i = 0; i < size; i++)
		tensor_marked[i] &= tensor_used[i];
}

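// Builds a reachability matrix over exec symbols: cell (i, j) is non-zero iff node j
// is an ancestor of node i, and its value is the hop distance from j to i (direct
// edges get 1; propagated ancestors keep the longest distance encountered). The
// topological visit order guarantees a node's own row is complete before its
// outgoings are processed, so each node forwards its accumulated ancestor list
// (staged in buf as (index, hop) pairs) to its children with hop + 1.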
static ccv_sparse_matrix_t* _ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit)
{
	ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
	int* buf = (int*)ccmalloc(sizeof(int) * graph->exec_symbol_info->rnum * 2);
	int buf_size;
#define for_block(x, val) \
	do { \
		if (((int32_t*)val)[0] > 0) \
		{ \
			buf[buf_size * 2] = x; \
			buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
			++buf_size; \
		} \
	} while (0)
	const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
	int i, j;
	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
		buf_size = 0; /* save all its parent deps to this buffer */
		ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
		if (vector)
			CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
		if (!node->outgoings)
			continue;
		for (i = 0; i < node->outgoings->rnum; i++)
		{
			int outgoing = *(int*)ccv_array_get(node->outgoings, i);
			const int32_t one = 1;
			ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
			/* If not found, set it. If the current node is the destination node, there is no
			 * need to set itself as a parent of subsequent nodes, due to its terminal nature. */
			if (!cell.i32 || cell.i32[0] == 0)
				ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
			if (buf_size > 0)
			{
				ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
				for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
				{
					ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
					/* If not found, set */
					if (!cell.i32 || cell.i32[0] == 0)
						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
					else {
						/* Otherwise, set to the longest one */
						int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
						ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
					}
				}
			}
		}
	} ccv_nnc_graph_visit_endfor
#undef for_block
	ccfree(buf);
	return exec_dep;
}

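// Per-tensor compression plan. compress.source / compress.destination are the exec
// symbols between which the compress node is spliced (the direct edge from source to
// destination is cut and rerouted through the compress node); decompress.source /
// decompress.destination play the same role for the decompress node, and
// decompress.nodes lists every backward exec symbol that consumes the tensor and
// must be rewired to read the decompressed symbol.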
typedef struct {
	int should_compress;
	ccv_nnc_tensor_param_t info;
	struct {
		int source;
		int destination;
	} compress;
	struct {
		int source;
		int destination;
		ccv_array_t* nodes;
	} decompress;
} ccv_nnc_compress_info_t;

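// Memory compression pass: for activation tensors that are produced by a convolution
// forward node and consumed again by a convolution backward node, splice a
// CMD_COMPRESSION_LSSC_FORWARD node right after the producer and a matching
// CMD_COMPRESSION_LSSC_BACKWARD node just before its backward consumers (at a common
// ancestor when there are several), so the full-size activation does not need to stay
// resident in between. LSSC is a block-based lossy compression scheme intended for
// half-precision activations; the trade-off (extra compute and some precision loss
// for lower peak memory) is the caller's to make.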
void ccv_nnc_symbolic_graph_memory_compression(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
	// Note that none of these exec_symbol_info and tensor_symbol_info pointers can be accessed once I start to mutate the graph. Therefore, I do the
	// mutation as the last step, to carefully step away from that possibility.
	ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
	ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0);
	ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, exec_symbol_info, graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
	ccv_nnc_symbolic_graph_symbol_infer(graph, visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
	const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
	const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
	uint32_t* const tensor_marked = (uint32_t*)cccalloc(((tensor_symbol_info_size + 31) >> 5) * 2, sizeof(uint32_t));
	uint32_t* const tensor_used = tensor_marked + ((tensor_symbol_info_size + 31) >> 5);
	int i, j, k;
	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
		// If this node is a convolution backward node, check the original input tensor symbol, and see if it is generated
		// by this graph. If it is, we can track it back and compress it immediately after its generation. I am not certain
		// which is better: "overlapping" the compression with the last bits of usage on the forward pass, or compressing
		// immediately after. I certainly need to decompress just before this convolution backward node.
		if (node->cmd.cmd == CCV_NNC_CONVOLUTION_BACKWARD && node->input_size > 1 && node->inputs[1] >= 0)
		{
			const int d = node->inputs[1];
			// If this tensor is an alias, assigned (while loop), or bypassed (case..of), skip.
			if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
				tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
				continue;
			tensor_marked[d >> 5] |= (1u << (d & 0x1f));
		}
		if (node->cmd.cmd == CCV_NNC_CONVOLUTION_FORWARD && node->output_size >= 1 && node->outputs[0] >= 0)
		{
			const int d = node->outputs[0];
			// If this tensor is an alias, assigned (while loop), or bypassed (case..of), skip.
			if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
				tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
				continue;
			tensor_marked[d >> 5] |= (1u << (d & 0x1f));
		}
		if (ccv_nnc_cmd_is_backward(node->cmd))
			for (i = 0; i < node->input_size; i++)
			{
				const int d = node->inputs[i];
				if (d >= 0)
					tensor_used[d >> 5] |= (1u << (d & 0x1f));
			}
	} ccv_nnc_graph_visit_endfor
	// If a tensor is marked but never used in the backward pass, there is no need to compress it.
	_ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
	// If a tensor is not generated on the forward pass, no need to compress it.
	memset(tensor_used, 0, sizeof(uint32_t) * ((tensor_symbol_info_size + 31) >> 5));
	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
		if (ccv_nnc_cmd_is_forward(node->cmd))
			for (i = 0; i < node->output_size; i++)
			{
				const int d = node->outputs[i];
				if (d >= 0)
					tensor_used[d >> 5] |= (1u << (d & 0x1f));
			}
	} ccv_nnc_graph_visit_endfor
	_ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
	// If this tensor is pointed to by an alias, we don't want to compress it either.
	for (i = 0; i < tensor_symbol_info_size; i++)
		if (tensor_symbol_info[i].alias_ref)
		{
			const int d = tensor_symbol_info[i].alias_ref - 1;
			// unmark.
			if ((tensor_marked[d >> 5] & (1u << (d & 0x1f))))
				tensor_marked[d >> 5] &= ~(1u << (d & 0x1f));
		}
	// Now tensor_marked only contains the tensors that we think are beneficial to compress. Find the best place to insert compression / decompression.
	ccv_nnc_compress_info_t* const compress_info = cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_compress_info_t));
	ccv_sparse_matrix_t* const exec_dep = _ccv_nnc_exec_dep_new(graph, visit);
	ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
		if (ccv_nnc_cmd_is_forward(node->cmd))
			for (i = 0; i < node->output_size; i++)
			{
				const int d = node->outputs[i];
				if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
					compress_info[d].compress.source = idx;
			}
		else if (ccv_nnc_cmd_is_backward(node->cmd))
			for (i = 0; i < node->input_size; i++)
			{
				const int d = node->inputs[i];
				if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
				{
					if (!compress_info[d].decompress.nodes)
						compress_info[d].decompress.nodes = ccv_array_new(sizeof(int), 0, 0);
					ccv_array_push(compress_info[d].decompress.nodes, &idx);
				}
			}
	} ccv_nnc_graph_visit_endfor
	ccv_array_t* const commons = ccv_array_new(sizeof(int), 0, 0);
	for (i = 0; i < tensor_symbol_info_size; i++)
	{
		if (!compress_info[i].decompress.nodes)
			continue;
		// If we have more than one destination, we need to find the common ancestor.
		ccv_array_t* decompress_nodes = compress_info[i].decompress.nodes;
		if (decompress_nodes && decompress_nodes->rnum > 1)
		{
			ccv_array_clear(commons);
			ccv_array_t* const nodes = compress_info[i].decompress.nodes;
			const int d = *(int*)ccv_array_get(nodes, 0);
			ccv_array_push(commons, &d);
#define for_block(x, val) \
			do { \
				const int dd = ((int32_t*)val)[0]; \
				if (dd > 0) \
					ccv_array_push(commons, &x); \
			} while (0)
			ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, d);
			if (vector)
				CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
#undef for_block
			for (j = 0; j < commons->rnum;)
			{
				const int d = *(int*)ccv_array_get(commons, j);
				int flag = 0;
				for (k = 1; k < nodes->rnum && !flag; k++)
				{
					const int dd = *(int*)ccv_array_get(nodes, k);
					if (dd == d) // If it is the same as the common candidate, keep it.
						continue;
					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dd, d);
					// If the candidate is not an ancestor of this destination, it is not a
					// common ancestor. Remove it from the list.
					if (!cell.i32 || cell.i32[0] == 0)
						flag = 1;
				}
				if (flag)
				{
					if (j < commons->rnum - 1)
						*(int*)ccv_array_get(commons, j) = *(int*)ccv_array_get(commons, commons->rnum - 1);
					--commons->rnum;
					continue;
				}
				++j;
			}
			// If there is no common ancestor, we cannot do this. Abort the whole thing.
			if (commons->rnum == 0)
				continue;
			decompress_nodes = commons;
		}
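		// When there were multiple decompress nodes, decompress_nodes now points at
		// commons: the first decompress node plus its ancestors, filtered down to the
		// entries that are an ancestor of (or equal to) every other decompress node.
		// E.g. in a diamond a -> {b, c} -> d with decompress nodes b and c, only a
		// survives: b is not an ancestor of c, and vice versa.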
		// Find source / destination for compress nodes.
		const int compress_source = compress_info[i].compress.source;
		ccv_array_t* const outgoings = exec_symbol_info[compress_source].outgoings;
		if (!outgoings || outgoings->rnum == 0)
			continue;
		int hop = exec_symbol_info_size;
		for (j = 0; j < outgoings->rnum; j++)
		{
			const int d = *(int*)ccv_array_get(outgoings, j);
			const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, d, compress_source);
			if (cell.i32[0] < hop)
			{
				compress_info[i].compress.destination = d;
				hop = cell.i32[0];
			}
		}
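		// hop is now the shortest recorded distance from compress_source to any of its
		// outgoing nodes, and compress.destination is that nearest successor; the
		// compress node will be spliced onto this edge. If hop is unchanged (defensive;
		// a direct edge always records a distance), skip this tensor.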
		if (hop == exec_symbol_info_size)
			continue;
		// Find source / destination for decompress nodes.
		// First, find the node that every other decompress node can reach.
		int decompress_destination = -1;
		for (j = 0; j < decompress_nodes->rnum; j++)
		{
			int flag = 0;
			const int dj = *(int*)ccv_array_get(decompress_nodes, j);
			for (k = 0; !flag && k < decompress_nodes->rnum; k++)
				if (j != k)
				{
					const int dk = *(int*)ccv_array_get(decompress_nodes, k);
					const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dj, dk);
					if (!cell.i32 || cell.i32[0] == 0)
						flag = 1;
				}
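			// dj survives only if every other decompress node can reach it. The sentinel
			// encoding below keeps -1 for "no candidate yet" and collapses to -2 once a
			// second candidate shows up, so only a unique common descendant stays >= 0.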
			if (!flag)
				decompress_destination = (decompress_destination == -1) ? dj : -2;
		}
		// Cannot continue: either there is no node that is a descendant of every other
		// decompress node, or there is more than one such node.
		if (decompress_destination < 0)
			continue;
		compress_info[i].decompress.destination = decompress_destination;
		hop = exec_symbol_info_size;
#define for_block(x, val) \
		do { \
			const int dd = ((int32_t*)val)[0]; \
			if (dd > 0 && dd < hop) \
			{ \
				compress_info[i].decompress.source = x; \
				hop = dd; \
			} \
		} while (0)
		ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, decompress_destination);
		if (vector)
			CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
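		// decompress.source is now the ancestor of decompress_destination with the
		// smallest hop count, i.e. its nearest predecessor; the decompress node will be
		// spliced onto this source -> destination edge, mirroring the compress splice.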
		// Final check: the destination of the compression should precede the source of the decompression.
		const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, compress_info[i].decompress.source, compress_info[i].compress.destination);
		if (compress_info[i].decompress.source != compress_info[i].compress.destination && (!cell.i32 || cell.i32[0] == 0))
			continue;
		// Mark it as ready to be compressed.
		compress_info[i].info = tensor_symbol_info[i].info;
		compress_info[i].should_compress = 1;
	}
	// Do the graph mutation now based on the compression info.
	for (i = 0; i < tensor_symbol_info_size; i++)
		if (compress_info[i].should_compress)
		{
			ccv_nnc_tensor_param_t compressed_params;
			ccv_nnc_hint_tensor_auto(CMD_COMPRESSION_LSSC_FORWARD(), &compress_info[i].info, 1, ccv_nnc_no_hint, &compressed_params, 1);
			const ccv_nnc_tensor_symbol_t original = (ccv_nnc_tensor_symbol_t){
				.graph = graph,
				.d = i
			};
			const ccv_nnc_tensor_symbol_t compressed = ccv_nnc_tensor_symbol_new(graph, compressed_params, 0);
			const ccv_nnc_graph_exec_symbol_t compress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_FORWARD(), TENSOR_SYMBOL_LIST(original), TENSOR_SYMBOL_LIST(compressed), 0);
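			// Splice the compress node onto the recorded edge: cut the direct link
			// compress.source -> compress.destination, then route it through
			// compress_node instead. The decompress node below is spliced the same way
			// onto decompress.source -> decompress.destination.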
			ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].compress.source,
			}, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].compress.destination
			});
			ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].compress.source,
			}, compress_node);
			ccv_nnc_graph_exec_symbol_concat(graph, compress_node, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].compress.destination
			});
			const ccv_nnc_tensor_symbol_t decompressed = ccv_nnc_tensor_symbol_new(graph, compress_info[i].info, 0);
			const ccv_nnc_graph_exec_symbol_t decompress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_BACKWARD(), TENSOR_SYMBOL_LIST(compressed), TENSOR_SYMBOL_LIST(decompressed), 0);
			ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].decompress.source,
			}, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].decompress.destination
			});
			ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].decompress.source,
			}, decompress_node);
			ccv_nnc_graph_exec_symbol_concat(graph, decompress_node, (ccv_nnc_graph_exec_symbol_t){
				.graph = graph,
				.d = compress_info[i].decompress.destination
			});
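			// Finally, rewire every backward consumer of the original tensor symbol to
			// read the decompressed symbol instead, by patching its inputs[] in place.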
			for (j = 0; j < compress_info[i].decompress.nodes->rnum; j++)
			{
				const int d = *(int*)ccv_array_get(compress_info[i].decompress.nodes, j);
				ccv_nnc_graph_exec_symbol_info_t* const destination_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
				for (k = 0; k < destination_info->input_size; k++)
					if (destination_info->inputs[k] == i)
						destination_info->inputs[k] = decompressed.d;
			}
		}
	ccv_nnc_graph_visit_free(visit);
	ccv_array_free(commons);
	ccv_matrix_free(exec_dep);
	ccfree(tensor_marked);
	for (i = 0; i < tensor_symbol_info_size; i++)
		if (compress_info[i].decompress.nodes)
			ccv_array_free(compress_info[i].decompress.nodes);
	ccfree(compress_info);
}
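
/* A minimal usage sketch (hypothetical call site; assumes a symbolic graph with
 * convolution forward / backward pairs already constructed, and relies on the
 * SYMBOLIC_GRAPH_SOURCES / SYMBOLIC_GRAPH_DESTINATIONS convenience macros from
 * ccv_nnc_easy.h):
 *
 *   ccv_nnc_symbolic_graph_t* const graph = ccv_nnc_symbolic_graph_new();
 *   // ... build the forward graph, then ccv_nnc_symbolic_graph_backward(...) ...
 *   ccv_nnc_graph_exec_symbol_autogen(graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 *   ccv_nnc_symbolic_graph_memory_compression(graph,
 *       SYMBOLIC_GRAPH_SOURCES(graph), SYMBOLIC_GRAPH_DESTINATIONS(graph));
 *   // Compile and run as usual; eligible activations are now stored compressed
 *   // between the forward producer and the backward consumers.
 */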