Coverage Report

Created: 2021-09-30 21:42

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_symbolic_graph_memory_compression.c
Every executable line below reports a hit count of 0; this file was not exercised by the run.
#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#include "_ccv_nnc_symbolic_graph.h"

// MARK - Level-3.5 API

static void _ccv_nnc_remove_unused_from_marked(const uint32_t* const tensor_used, const int size, uint32_t* const tensor_marked)
{
  int i;
  for (i = 0; i < size; i++)
    tensor_marked[i] &= tensor_used[i];
}

static ccv_sparse_matrix_t* _ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit)
{
  ccv_sparse_matrix_t* exec_dep = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, graph->exec_symbol_info->rnum, CCV_32S | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
  int* buf = (int*)ccmalloc(sizeof(int) * graph->exec_symbol_info->rnum * 2);
  int buf_size;
#define for_block(x, val) \
  do { \
    if (((int32_t*)val)[0] > 0) \
    { \
      buf[buf_size * 2] = x; \
      buf[buf_size * 2 + 1] = ((int32_t*)val)[0] + 1; \
      ++buf_size; \
    } \
  } while (0)
  const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
  int i, j;
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
    buf_size = 0; /* save all its parent deps to this buffer */
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, idx);
    if (vector)
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
    if (!node->outgoings)
      continue;
    for (i = 0; i < node->outgoings->rnum; i++)
    {
      int outgoing = *(int*)ccv_array_get(node->outgoings, i);
      const int32_t one = 1;
      ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, idx);
      /* If not found, set it. If the current node is a destination node, there is no need
       * to set it as a parent of subsequent nodes, due to its terminal nature. */
      if (!term && (!cell.i32 || cell.i32[0] == 0))
        ccv_set_sparse_matrix_cell(exec_dep, outgoing, idx, &one);
      for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
      {
        ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2]);
        /* If not found, set */
        if (!cell.i32 || cell.i32[0] == 0)
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &buf[j * 2 + 1]);
        else {
          /* Otherwise, set to the longest one */
          int32_t dep = ccv_max(cell.i32[0], buf[j * 2 + 1]);
          ccv_set_sparse_matrix_cell(exec_dep, outgoing, buf[j * 2], &dep);
        }
      }
    }
  } ccv_nnc_graph_visit_endfor
#undef for_block
  ccfree(buf);
  return exec_dep;
}
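
/* Illustration of the dependency matrix built above: for a simple chain a -> b -> c, it ends up
 * with cell(b, a) = 1, cell(c, b) = 1 and cell(c, a) = 2, i.e. cell(row = descendant,
 * col = ancestor) records the longest hop count from the ancestor to the descendant, while an
 * empty or zero cell is treated as "not reachable". The compression pass below relies on this
 * matrix both for reachability tests and for picking the nearest node by hop count. */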

typedef struct {
  int should_compress;
  ccv_nnc_tensor_param_t info;
  struct {
    int source;
    int destination;
  } compress;
  struct {
    int source;
    int destination;
    ccv_array_t* nodes;
  } decompress;
} ccv_nnc_compress_info_t;
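
/* Roughly, the pass below fills one ccv_nnc_compress_info_t per tensor symbol that is worth
 * compressing: compress.source is the forward node that produces the tensor and
 * compress.destination its nearest outgoing node, so a LSSC compression node can be spliced
 * between them; decompress.nodes collects the backward nodes that still read the tensor, and
 * the decompression node is spliced between decompress.source and decompress.destination,
 * just ahead of the reader itself or of the last common ancestor of all the readers. */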

void ccv_nnc_symbolic_graph_memory_compression(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size)
{
  // Note that none of these exec_symbol_info and tensor_symbol_info pointers can be accessed once I start
  // to mutate the graph. Therefore, I will do the mutation at the last step, carefully stepping away from that possibility.
  ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
  ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, 0);
  ccv_nnc_graph_visit_t* const visit = ccv_nnc_graph_visit_new(graph, exec_symbol_info, graph->exec_symbol_info->rnum, sources, source_size, destinations, destination_size, 0);
  ccv_nnc_symbolic_graph_symbol_infer(graph, visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
  const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
  const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
  uint32_t* const tensor_marked = (uint32_t*)cccalloc(((tensor_symbol_info_size + 31) >> 5) * 2, sizeof(uint32_t));
  uint32_t* const tensor_used = tensor_marked + ((tensor_symbol_info_size + 31) >> 5);
  int i, j, k;
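  /* tensor_marked and tensor_used are bit sets with one bit per tensor symbol, packed into
   * (tensor_symbol_info_size + 31) / 32 uint32_t words; bit d is set via
   * tensor_marked[d >> 5] |= (1u << (d & 0x1f)). tensor_marked collects compression candidates,
   * tensor_used records which tensors the backward pass actually reads. */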
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    // If this node is a convolution backward node, check the original input tensor symbol, and see if it is generated
    // by this graph. If it is, we can track it back and compress it immediately after its generation.
    // I am not certain which is better: whether to "overlap" the compression with the last bits of usage on the forward pass,
    // or to just compress immediately after. I certainly need to decompress just before this convolution backward node.
    if (node->cmd.cmd == CCV_NNC_CONVOLUTION_BACKWARD && node->input_size > 1 && node->inputs[1] >= 0)
    {
      const int d = node->inputs[1];
      // If this tensor is an alias, or is assigned to (while loop), or bypassed (case..of), skip.
      if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
          tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
        continue;
      tensor_marked[d >> 5] |= (1u << (d & 0x1f));
    }
    if (node->cmd.cmd == CCV_NNC_CONVOLUTION_FORWARD && node->output_size >= 1 && node->outputs[0] >= 0)
    {
      const int d = node->outputs[0];
      // If this tensor is an alias, or is assigned to (while loop), or bypassed (case..of), skip.
      if (tensor_symbol_info[d].alias_ref || tensor_symbol_info[d].assign_ref || tensor_symbol_info[d].bypass_ref ||
          tensor_symbol_info[d].r_assign_ref || tensor_symbol_info[d].r_bypass_ref)
        continue;
      tensor_marked[d >> 5] |= (1u << (d & 0x1f));
    }
    if (ccv_nnc_cmd_is_backward(node->cmd))
      for (i = 0; i < node->input_size; i++)
      {
        const int d = node->inputs[i];
        if (d >= 0)
          tensor_used[d >> 5] |= (1u << (d & 0x1f));
      }
  } ccv_nnc_graph_visit_endfor
  // If a tensor is marked but never used in the backward pass, no need to compress it.
  _ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
  // If a tensor is not generated on the forward pass, no need to compress it.
  memset(tensor_used, 0, sizeof(uint32_t) * ((tensor_symbol_info_size + 31) >> 5));
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    if (ccv_nnc_cmd_is_forward(node->cmd))
      for (i = 0; i < node->output_size; i++)
      {
        const int d = node->outputs[i];
        if (d >= 0)
          tensor_used[d >> 5] |= (1u << (d & 0x1f));
      }
  } ccv_nnc_graph_visit_endfor
  _ccv_nnc_remove_unused_from_marked(tensor_used, (tensor_symbol_info_size + 31) >> 5, tensor_marked);
  // If this tensor is pointed to by an alias, we don't want to compress it either.
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (tensor_symbol_info[i].alias_ref)
    {
      const int d = tensor_symbol_info[i].alias_ref - 1;
      // unmark.
      if ((tensor_marked[d >> 5] & (1u << (d & 0x1f))))
        tensor_marked[d >> 5] &= ~(1u << (d & 0x1f));
    }
  // Now tensor_marked only contains the tensors that we think are beneficial to compress. Find the best places to insert compression / decompression.
  ccv_nnc_compress_info_t* const compress_info = cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_compress_info_t));
  ccv_sparse_matrix_t* const exec_dep = _ccv_nnc_exec_dep_new(graph, visit);
  ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx) {
    if (ccv_nnc_cmd_is_forward(node->cmd))
      for (i = 0; i < node->output_size; i++)
      {
        const int d = node->outputs[i];
        if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
          compress_info[d].compress.source = idx;
      }
    else if (ccv_nnc_cmd_is_backward(node->cmd))
      for (i = 0; i < node->input_size; i++)
      {
        const int d = node->inputs[i];
        if (d >= 0 && (tensor_marked[d >> 5] & (1u << (d & 0x1f))))
        {
          if (!compress_info[d].decompress.nodes)
            compress_info[d].decompress.nodes = ccv_array_new(sizeof(int), 0, 0);
          ccv_array_push(compress_info[d].decompress.nodes, &idx);
        }
      }
  } ccv_nnc_graph_visit_endfor
  ccv_array_t* const commons = ccv_array_new(sizeof(int), 0, 0);
  for (i = 0; i < tensor_symbol_info_size; i++)
  {
    if (!compress_info[i].decompress.nodes)
      continue;
    // If we have more than one destination, we need to find their common ancestor.
    ccv_array_t* decompress_nodes = compress_info[i].decompress.nodes;
    if (decompress_nodes && decompress_nodes->rnum > 1)
    {
      ccv_array_clear(commons);
      ccv_array_t* const nodes = compress_info[i].decompress.nodes;
      const int d = *(int*)ccv_array_get(nodes, 0);
      ccv_array_push(commons, &d);
#define for_block(x, val) \
      do { \
        const int dd = ((int32_t*)val)[0]; \
        if (dd > 0) \
          ccv_array_push(commons, &x); \
      } while (0)
      ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, d);
      if (vector)
        CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
#undef for_block
      for (j = 0; j < commons->rnum;)
      {
        const int d = *(int*)ccv_array_get(commons, j);
        int flag = 0;
        for (k = 1; k < nodes->rnum && !flag; k++)
        {
          const int dd = *(int*)ccv_array_get(nodes, k);
          if (dd == d) // If the candidate is this destination itself, keep it.
            continue;
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dd, d);
          // If this destination cannot be reached from the candidate, the candidate is not a common ancestor.
          // Remove it from the list.
          if (!cell.i32 || cell.i32[0] == 0)
            flag = 1;
        }
        if (flag)
        {
          if (j < commons->rnum - 1)
            *(int*)ccv_array_get(commons, j) = *(int*)ccv_array_get(commons, commons->rnum - 1);
          --commons->rnum;
          continue;
        }
        ++j;
      }
      // If there is no common ancestor, we cannot do this. Abort the whole thing.
      if (commons->rnum == 0)
        continue;
      decompress_nodes = commons;
    }
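    /* Illustration: if tensor i is read by backward nodes u and v, the decompressed tensor has to
     * be ready before both, so the block above starts from the ancestors of the first reader (plus
     * that reader itself) and keeps only the nodes from which every other reader can be reached,
     * i.e. the common ancestors of all readers. If none survives, the tensor is skipped. */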
    // Find the source / destination for the compression node.
    const int compress_source = compress_info[i].compress.source;
    ccv_array_t* const outgoings = exec_symbol_info[compress_source].outgoings;
    if (!outgoings || outgoings->rnum == 0)
      continue;
    int hop = exec_symbol_info_size;
    for (j = 0; j < outgoings->rnum; j++)
    {
      const int d = *(int*)ccv_array_get(outgoings, j);
      const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, d, compress_source);
      if (cell.i32[0] < hop)
      {
        compress_info[i].compress.destination = d;
        hop = cell.i32[0];
      }
    }
    if (hop == exec_symbol_info_size)
      continue;
    // Find the source / destination for the decompression node.
    // First, find the node that every other node in the list can reach.
    int decompress_destination = -1;
    for (j = 0; j < decompress_nodes->rnum; j++)
    {
      int flag = 0;
      const int dj = *(int*)ccv_array_get(decompress_nodes, j);
      for (k = 0; !flag && k < decompress_nodes->rnum; k++)
        if (j != k)
        {
          const int dk = *(int*)ccv_array_get(decompress_nodes, k);
          const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, dj, dk);
          if (!cell.i32 || cell.i32[0] == 0)
            flag = 1;
        }
      if (!flag)
        decompress_destination = (decompress_destination == -1) ? dj : -2;
    }
    // Cannot continue: either there is no node that is a descendant of every other node in the list,
    // or there is more than one such node.
    if (decompress_destination < 0)
      continue;
    compress_info[i].decompress.destination = decompress_destination;
    hop = exec_symbol_info_size;
#define for_block(x, val) \
    do { \
      const int dd = ((int32_t*)val)[0]; \
      if (dd > 0 && dd < hop) \
      { \
        compress_info[i].decompress.source = x; \
        hop = dd; \
      } \
    } while (0)
    ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(exec_dep, decompress_destination);
    if (vector)
      CCV_SPARSE_VECTOR_FOREACH(exec_dep, vector, for_block);
    // Final check: the destination of the compression must come no later than the source of the decompression
    // (it is either the same node or an ancestor of it).
    const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep, compress_info[i].decompress.source, compress_info[i].compress.destination);
    if (compress_info[i].decompress.source != compress_info[i].compress.destination && (!cell.i32 || cell.i32[0] == 0))
      continue;
    // Mark it as ready to be compressed.
    compress_info[i].info = tensor_symbol_info[i].info;
    compress_info[i].should_compress = 1;
  }
  // Do the graph mutation now based on the compression info.
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (compress_info[i].should_compress)
    {
      // Let the LSSC forward command infer the compressed tensor parameters from the original ones.
      ccv_nnc_tensor_param_t compressed_params;
      ccv_nnc_hint_tensor_auto(CMD_COMPRESSION_LSSC_FORWARD(), &compress_info[i].info, 1, ccv_nnc_no_hint, &compressed_params, 1);
      const ccv_nnc_tensor_symbol_t original = (ccv_nnc_tensor_symbol_t){
        .graph = graph,
        .d = i
      };
      const ccv_nnc_tensor_symbol_t compressed = ccv_nnc_tensor_symbol_new(graph, compressed_params, 0);
      const ccv_nnc_graph_exec_symbol_t compress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_FORWARD(), TENSOR_SYMBOL_LIST(original), TENSOR_SYMBOL_LIST(compressed), 0);
      // Splice the compression node between compress.source and compress.destination, then drop the direct edge.
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.source,
      }, compress_node);
      ccv_nnc_graph_exec_symbol_concat(graph, compress_node, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.destination
      });
      ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.source,
      }, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].compress.destination
      });
      const ccv_nnc_tensor_symbol_t decompressed = ccv_nnc_tensor_symbol_new(graph, compress_info[i].info, 0);
      const ccv_nnc_graph_exec_symbol_t decompress_node = ccv_nnc_graph_exec_symbol_new(graph, CMD_COMPRESSION_LSSC_BACKWARD(), TENSOR_SYMBOL_LIST(compressed), TENSOR_SYMBOL_LIST(decompressed), 0);
      // Likewise splice the decompression node between decompress.source and decompress.destination.
      ccv_nnc_graph_exec_symbol_concat(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.source,
      }, decompress_node);
      ccv_nnc_graph_exec_symbol_concat(graph, decompress_node, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.destination
      });
      ccv_nnc_graph_exec_symbol_disjoin(graph, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.source,
      }, (ccv_nnc_graph_exec_symbol_t){
        .graph = graph,
        .d = compress_info[i].decompress.destination
      });
      // Point the backward consumers at the decompressed tensor instead of the original.
      for (j = 0; j < compress_info[i].decompress.nodes->rnum; j++)
      {
        const int d = *(int*)ccv_array_get(compress_info[i].decompress.nodes, j);
        ccv_nnc_graph_exec_symbol_info_t* const destination_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, d);
        for (k = 0; k < destination_info->input_size; k++)
          if (destination_info->inputs[k] == i)
            destination_info->inputs[k] = decompressed.d;
      }
    }
  ccv_nnc_graph_visit_free(visit);
  ccv_array_free(commons);
  ccv_matrix_free(exec_dep);
  ccfree(tensor_marked);
  for (i = 0; i < tensor_symbol_info_size; i++)
    if (compress_info[i].decompress.nodes)
      ccv_array_free(compress_info[i].decompress.nodes);
  ccfree(compress_info);
}
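
For reference, a minimal invocation of this pass might look like the sketch below. It assumes the symbolic graph already contains both the forward and the backward convolution nodes (e.g. after ccv_nnc_symbolic_graph_backward has been run), and that the SYMBOLIC_GRAPH_SOURCES / SYMBOLIC_GRAPH_DESTINATIONS convenience macros from ccv_nnc_easy.h fit the graph's sources and destinations; otherwise pass the source / destination exec symbols explicitly.

  /* Sketch: run the LSSC memory compression pass over the whole symbolic graph. */
  ccv_nnc_symbolic_graph_memory_compression(graph,
    SYMBOLIC_GRAPH_SOURCES(graph),
    SYMBOLIC_GRAPH_DESTINATIONS(graph));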