Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_symbolic_graph_memory_compression.c
Every executable line below reports an execution count of 0 (the file is entirely uncovered), so the per-line numbers and counts are omitted from the source listing.

#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3.5 API

static void _ccv_nnc_remove_unused_from_marked(const uint32_t* const tensor_used, const int size, uint32_t* const tensor_marked)
{
  int i;
  for (i = 0; i < size; i++)
    tensor_marked[i] &= tensor_used[i];
}
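Throughout this file, tensor sets are tracked as uint32_t bitmaps: tensor symbol d lives in word d >> 5, at bit d & 0x1f. A minimal standalone sketch of that convention (the 100-symbol capacity is made up for illustration):

  uint32_t marked[(100 + 31) >> 5] = {0};    // one bit per tensor symbol, 32 per word
  const int d = 42;
  marked[d >> 5] |= (1u << (d & 0x1f));      // mark tensor 42
  if (marked[d >> 5] & (1u << (d & 0x1f)))   // test: tensor 42 is marked
    marked[d >> 5] &= ~(1u << (d & 0x1f));   // unmark it again
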
static ccv_sparse_matrix_t* _ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit)
{
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  int* buf = (int*)ccmalloc(sizeof(int) * graph->exec_symbol_info->rnum * 2);
  int buf_size;
#define for_block(x, val) \
  do { \
    if (((int32_t*)val)[0] > 0) \
    { \
      buf[buf_size * 2] = x; \
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
      ++buf_size; \
    } \
  } while (0)
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
  int i, j;
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
    buf_size = 0; /* save all its parent deps to this buffer */
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
    if (vector)
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
    if (!node->outgoings)
      continue;
    for (i = 0; i < node->outgoings->rnum; i++)
    {
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
      const int32_t one = 1;
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
      /* If not found, set it. If the current node is the destination node, there is no
       * need to set itself as a parent of subsequent nodes, due to its terminal nature. */
      if (!cell.i32 || cell.i32[0] == 0)
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
      if (buf_size > 0)
      {
        ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, outgoing);
        for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
        {
          ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2]);
          /* If not found, set. */
          if (!cell.i32 || cell.i32[0] == 0)
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &buf[j * 2 + 1]);
          else {
            /* Otherwise, set to the longest one. */
            int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
            ccv_set_sparse_matrix_cell_from_vector(exec_dep, vector, buf[j * 2], &dep);
          }
        }
      }
    }
  } ccv_nnc_graph_visit_endfor
#undef for_block
  ccfree(buf);
  return exec_dep;
}
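The matrix returned above follows a simple convention: cell (x, y) is non-zero iff exec node y is an ancestor of exec node x, and the stored value is the longest hop distance between them (1 for a direct edge). A minimal sketch, hand-encoding a hypothetical three-node chain a(0) -> b(1) -> c(2) with the same sparse-matrix calls used in this file (assert.h assumed):

  ccv_sparse_matrix_t* dep = ccv_sparse_matrix_new(3, 3, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  const int32_t one = 1, two = 2;
  ccv_set_sparse_matrix_cell(dep, 1, 0, &one); // b depends on a, 1 hop
  ccv_set_sparse_matrix_cell(dep, 2, 1, &one); // c depends on b, 1 hop
  ccv_set_sparse_matrix_cell(dep, 2, 0, &two); // c depends on a, longest path is 2 hops
  ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(dep, 2, 0);
  assert(cell.i32 && cell.i32[0] == 2);        // a is an ancestor of c
  ccv_matrix_free(dep);
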

typedef struct {
  int should_compress;
  ccv_nnc_tensor_param_t info;
  struct {
    int source;
    int destination;
  } compress;
  struct {
    int source;
    int destination;
    ccv_array_t* nodes;
  } decompress;
} ccv_nnc_compress_info_t;

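As the function below fills these in: compress.source / compress.destination are the forward node that produces the tensor and its nearest follower, between which the compression node is spliced; decompress.source / decompress.destination bracket the decompression node on the backward pass (the destination being the single consumer, or the latest common ancestor of all consumers); decompress.nodes collects every backward node that reads the tensor.
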
void ccv_nnc_symbolic_graph_memory_compression(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
  // Note that none of these exec_symbol_info and tensor_symbol_info can be accessed once I start to mutate the graph.
  // Therefore, I do the mutation as the last step, to carefully step away from that possibility.
  ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
  ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0);
  ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, exec_symbol_info, graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
  ccv_nnc_symbolic_graph_symbol_infer(graph, visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
  uint32_t* const tensor_marked = (uint32_t*)cccalloc(((tensor_symbol_info_size + 31) >> 5) * 2, sizeof(uint32_t));
  uint32_t* const tensor_used = tensor_marked + ((tensor_symbol_info_size + 31) >> 5);
  int i, j, k;
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    // If this node is a convolution backward node, check the original input tensor symbol, and see whether it is generated
    // by this graph. If it is, we can track it back and compress it immediately after its generation.
    // I am not certain which is better: "overlap" the compression with the last bits of usage on the forward pass,
    // or just compress immediately after. I certainly need to decompress just before this convolution backward node.
    if (node->cmd.cmd == CCV_NNC_CONVOLUTION_BACKWARD && node->input_size > 1 && node->inputs[1] >= 0)
    {
      const int d = node->inputs[1];
      // If this tensor is an alias, or assigned (while loop), or bypassed (case..of), skip.
      if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
          tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
        continue;
      tensor_marked[d >> 5] |= (1u << (d & 0x1f));
    }
    if (node->cmd.cmd == CCV_NNC_CONVOLUTION_FORWARD && node->output_size >= 1 && node->outputs[0] >= 0)
    {
      const int d = node->outputs[0];
      // If this tensor is an alias, or assigned (while loop), or bypassed (case..of), skip.
      if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
          tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
        continue;
      tensor_marked[d >> 5] |= (1u << (d & 0x1f));
    }
    if (ccv_nnc_cmd_is_backward(node->cmd))
      for (i = 0; i < node->input_size; i++)
      {
        const int d = node->inputs[i];
        if (d >= 0)
          tensor_used[d >> 5] |= (1u << (d & 0x1f));
      }
  } ccv_nnc_graph_visit_endfor
  // If a tensor is marked but never used in the backward pass, no need to compress it.
  _ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
  // If a tensor is not generated on the forward pass, no need to compress it.
  memset(tensor_used, 0, sizeof(uint32_t) * ((tensor_symbol_info_size + 31) >> 5));
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    if (ccv_nnc_cmd_is_forward(node->cmd))
      for (i = 0; i < node->output_size; i++)
      {
        const int d = node->outputs[i];
        if (d >= 0)
          tensor_used[d >> 5] |= (1u << (d & 0x1f));
      }
  } ccv_nnc_graph_visit_endfor
  _ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
  // If this tensor is pointed to by an alias, we don't want to compress it either.
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (tensor_symbol_info[i].alias_ref)
    {
      const int d = tensor_symbol_info[i].alias_ref - 1;
      // Unmark it.
      if ((tensor_marked[d >> 5] & (1u << (d & 0x1f))))
        tensor_marked[d >> 5] &= ~(1u << (d & 0x1f));
    }
  // Now tensor_marked only contains the tensors that we think are beneficial to compress. Find the best place to insert compression / decompression.
  ccv_nnc_compress_info_t* const compress_info = cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_compress_info_t));
  ccv_sparse_matrix_t* const exec_dep = _ccv_nnc_exec_dep_new(graph, visit);
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    if (ccv_nnc_cmd_is_forward(node->cmd))
      for (i = 0; i < node->output_size; i++)
      {
        const int d = node->outputs[i];
        if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
          compress_info[d].compress.source = idx;
      }
    else if (ccv_nnc_cmd_is_backward(node->cmd))
      for (i = 0; i < node->input_size; i++)
      {
        const int d = node->inputs[i];
        if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
        {
          if (!compress_info[d].decompress.nodes)
            compress_info[d].decompress.nodes = ccv_array_new(sizeof(int), 0, 0);
          ccv_array_push(compress_info[d].decompress.nodes, &idx);
        }
      }
  } ccv_nnc_graph_visit_endfor
  ccv_array_t* const commons = ccv_array_new(sizeof(int), 0, 0);
  for (i = 0; i < tensor_symbol_info_size; i++)
  {
    if (!compress_info[i].decompress.nodes)
      continue;
    // If we have more than one destination, we need to find the common ancestor.
    ccv_array_t* decompress_nodes = compress_info[i].decompress.nodes;
    if (decompress_nodes && decompress_nodes->rnum > 1)
    {
      ccv_array_clear(commons);
      ccv_array_t* const nodes = compress_info[i].decompress.nodes;
      const int d = *(int*)ccv_array_get(nodes, 0);
      ccv_array_push(commons, &d);
#define for_block(x, val) \
      do { \
        const int dd = ((int32_t*)val)[0]; \
        if (dd > 0) \
          ccv_array_push(commons, &x); \
      } while (0)
      ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, d);
      if (vector)
        CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
#undef for_block
      for (j = 0; j < commons->rnum;)
      {
        const int d = *(int*)ccv_array_get(commons, j);
        int flag = 0;
        for (k = 1; k < nodes->rnum && !flag; k++)
        {
          const int dd = *(int*)ccv_array_get(nodes, k);
          if (dd == d) // If it is the same as the common candidate, keep.
            continue;
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dd, d);
          // If I cannot reach the ancestor from this destination, it is not a common ancestor.
          // Remove it from the list.
          if (!cell.i32 || cell.i32[0] == 0)
            flag = 1;
        }
        if (flag)
        {
          if (j < commons->rnum - 1)
            *(int*)ccv_array_get(commons, j) = *(int*)ccv_array_get(commons, commons->rnum - 1);
          --commons->rnum;
          continue;
        }
        ++j;
      }
      // If there is no common ancestor, we cannot do this. Abort the whole thing.
      if (commons->rnum == 0)
        continue;
      decompress_nodes = commons;
    }
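    /* The loop above prunes non-ancestors from commons with an unordered swap-remove:
     * slot j is overwritten by the last element, rnum shrinks, and j stays put so the
     * swapped-in element gets re-examined. In isolation, the same idiom on a
     * hypothetical ccv_array_t of ints (a sketch, not part of this pass):
     *
     *   ccv_array_t* a = ccv_array_new(sizeof(int), 0, 0);
     *   int v, j = 0;
     *   for (v = 0; v < 6; v++)
     *     ccv_array_push(a, &v);                    // a = [0, 1, 2, 3, 4, 5]
     *   while (j < a->rnum)
     *     if (*(int*)ccv_array_get(a, j) % 2 == 0)  // drop evens, order not preserved
     *     {
     *       if (j < a->rnum - 1)
     *         *(int*)ccv_array_get(a, j) = *(int*)ccv_array_get(a, a->rnum - 1);
     *       --a->rnum;                              // j stays, re-check the swapped-in value
     *     } else
     *       ++j;
     *   ccv_array_free(a);                          // leaves [5, 1, 3] in some order
     */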
    // Find the source / destination for the compress node.
    const int compress_source = compress_info[i].compress.source;
    ccv_array_t* const outgoings = exec_symbol_info[compress_source].outgoings;
    if (!outgoings || outgoings->rnum == 0)
      continue;
    int hop = exec_symbol_info_size;
    for (j = 0; j < outgoings->rnum; j++)
    {
      const int d = *(int*)ccv_array_get(outgoings, j);
      const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, d, compress_source);
      if (cell.i32[0] < hop)
      {
        compress_info[i].compress.destination = d;
        hop = cell.i32[0];
      }
    }
    if (hop == exec_symbol_info_size)
      continue;
    // Find the source / destination for the decompress node.
    // First, find the node that everyone can reach.
    int decompress_destination = -1;
    for (j = 0; j < decompress_nodes->rnum; j++)
    {
      int flag = 0;
      const int dj = *(int*)ccv_array_get(decompress_nodes, j);
      for (k = 0; !flag && k < decompress_nodes->rnum; k++)
        if (j != k)
        {
          const int dk = *(int*)ccv_array_get(decompress_nodes, k);
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dj, dk);
          if (!cell.i32 || cell.i32[0] == 0)
            flag = 1;
        }
      if (!flag)
        decompress_destination = (decompress_destination == -1) ? dj : -2;
    }
    // Cannot continue: either we cannot find a node that is a child of everyone, or
    // there is more than one such node.
    if (decompress_destination < 0)
      continue;
    compress_info[i].decompress.destination = decompress_destination;
    hop = exec_symbol_info_size;
#define for_block(x, val) \
    do { \
      const int dd = ((int32_t*)val)[0]; \
      if (dd > 0 && dd < hop) \
      { \
        compress_info[i].decompress.source = x; \
        hop = dd; \
      } \
    } while (0)
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, decompress_destination);
    if (vector)
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
    // Final check: the destination of the compression should come before (or be) the source of the decompression.
    const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, compress_info[i].decompress.source, compress_info[i].compress.destination);
    if (compress_info[i].decompress.source != compress_info[i].compress.destination && (!cell.i32 || cell.i32[0] == 0))
      continue;
    // Mark it as ready to be compressed.
    compress_info[i].info = tensor_symbol_info[i].info;
    compress_info[i].should_compress = 1;
  }
  // Do the graph mutation now based on the compression info.
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (compress_info[i].should_compress)
    {
      ccv_nnc_tensor_param_t compressed_params;
      ccv_nnc_hint_tensor_auto(CMD_COMPRESSION_LSSC_FORWARD(), &compress_info[i].info, 1, ccv_nnc_no_hint, &compressed_params, 1);
      const ccv_nnc_tensor_symbol_t original = (ccv_nnc_tensor_symbol_t){
        .graph = graph,
        .d = i
      };
      const ccv_nnc_tensor_symbol_t compressed = ccv_nnc_tensor_symbol_new(graph, compressed_params, 0);
      const ccv_nnc_graph_exec_symbol_t compress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_FORWARD(), TENSOR_SYMBOL_LIST(original), TENSOR_SYMBOL_LIST(compressed), 0);
      ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.source,
      }, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.destination
      });
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.source,
      }, compress_node);
      ccv_nnc_graph_exec_symbol_concat(graph, compress_node, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.destination
      });
      const ccv_nnc_tensor_symbol_t decompressed = ccv_nnc_tensor_symbol_new(graph, compress_info[i].info, 0);
      const ccv_nnc_graph_exec_symbol_t decompress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_BACKWARD(), TENSOR_SYMBOL_LIST(compressed), TENSOR_SYMBOL_LIST(decompressed), 0);
      ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.source,
      }, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.destination
      });
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.source,
      }, decompress_node);
      ccv_nnc_graph_exec_symbol_concat(graph, decompress_node, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.destination
      });
      for (j = 0; j < compress_info[i].decompress.nodes->rnum; j++)
      {
        const int d = *(int*)ccv_array_get(compress_info[i].decompress.nodes, j);
        ccv_nnc_graph_exec_symbol_info_t* const destination_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
        for (k = 0; k < destination_info->input_size; k++)
          if (destination_info->inputs[k] == i)
            destination_info->inputs[k] = decompressed.d;
      }
    }
  ccv_nnc_graph_visit_free(visit);
  ccv_array_free(commons);
  ccv_matrix_free(exec_dep);
  ccfree(tensor_marked);
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (compress_info[i].decompress.nodes)
      ccv_array_free(compress_info[i].decompress.nodes);
  ccfree(compress_info);
}
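
For context, a minimal sketch of how this pass might be invoked on an already-built symbolic graph; the graph construction is elided, and the SYMBOLIC_GRAPH_SOURCES / SYMBOLIC_GRAPH_DESTINATIONS convenience macros are assumed to come from ccv_nnc_easy.h (explicit source / destination arrays work as well):

  ccv_nnc_symbolic_graph_t* const graph = ccv_nnc_symbolic_graph_new();
  /* ... build a training graph with convolution forward / backward pairs ... */
  ccv_nnc_symbolic_graph_memory_compression(graph,
    SYMBOLIC_GRAPH_SOURCES(graph), SYMBOLIC_GRAPH_DESTINATIONS(graph));
  /* Convolution inputs held for the backward pass are now stored LSSC-compressed
   * between their last forward use and the decompression before their backward use. */
  ccv_nnc_symbolic_graph_free(graph);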