Coverage Report

Created: 2024-08-19 11:27

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_backward.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_nnc_easy.h"
5
#include "ccv_internal.h"
6
#ifdef HAVE_CUDA
7
#include "gpu/ccv_nnc_compat.h"
8
#endif
9
#include "_ccv_nnc_dynamic_graph.h"
10
11
// MARK - Level-4.5 API
12
13
static void* _ccv_nnc_dynamic_compile_alloc(const int type, const int pinned_mem, const size_t size, void* const arg)
14
266
{
15
266
  assert(type & CCV_TENSOR_GPU_MEMORY);
16
266
  ccv_nnc_dy_xpu_alloc_t* const xpu_alloc  = (ccv_nnc_dy_xpu_alloc_t*)arg;
17
266
  const int device = CCV_TENSOR_GET_DEVICE_ID(type);
18
266
  return ccv_nnc_xpu_alloc(xpu_alloc->xpu_alloc, device, xpu_alloc->stream, size);
19
266
}
20
21
static void _ccv_nnc_dynamic_compile_free(void* const ptr, void* const arg)
22
266
{
23
266
  ccv_nnc_xpu_alloc_t* const xpu_alloc = (ccv_nnc_xpu_alloc_t*)arg;
24
266
  ccv_nnc_xpu_free(xpu_alloc, ptr);
25
266
}
26
27
const ccv_nnc_symbolic_graph_compile_allocator_vtab_t ccv_nnc_dy_allocator_isa = {
28
  .alloc = _ccv_nnc_dynamic_compile_alloc,
29
  .free = _ccv_nnc_dynamic_compile_free
30
};
31
32
// Compute gradients of f_variables w.r.t. inputs on the dynamic graph's tape.
//
// High-level flow (all on dynamic_graph->tape, which is mutated and then restored):
//   1. Validate that every f_variable and input has been executed on the tape.
//   2. Find the exec-symbol sources (producers of inputs) and destinations
//      (producers of f_variables) that bracket the forward sub-graph.
//   3. Install new-symbol hooks so every symbol created below is recorded in
//      symbol_stack and can be removed afterwards, restoring the tape.
//   4. Run ccv_nnc_symbolic_graph_backward to emit the gradient sub-graph.
//   5. Bind existing variable tensors, wire per-output handling (accumulate via
//      EWSUM into existing outputs, bind fresh outputs directly, or prune
//      gradients nobody asked for), seed df with SET(1) or bind df_optionals.
//   6. Compile with the dynamic-graph XPU allocator, run (sync or on
//      stream_context), free the compiled artifacts, pop the recorded symbols.
//
// Parameters:
//   dynamic_graph   - the dynamic graph whose tape holds the forward trace.
//   f_variables     - the f_variable_size outputs to differentiate (e.g. losses).
//   df_optionals    - optional per-f gradient seeds; when absent, df is set to 1.
//   inputs/outputs  - input_size (== output_size) variables to differentiate
//                     against, and where the gradients are written. outputs[i]
//                     may be 0 to discard that gradient.
//   stream_context  - optional stream to schedule and run the compiled graph on.
void ccv_nnc_dynamic_graph_backward(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_tensor_variable_t* const f_variables, const int f_variable_size, const ccv_nnc_tensor_variable_t* const df_optionals, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	int d, i, j, k;
	assert(input_size == output_size);
	assert(input_size > 0);
	assert(output_size > 0);
	assert(f_variable_size > 0);
	int f_source_size = 0;
	// Both f_variable and tensor_variable should be, at least, executed. Otherwise we cannot differentiate.
	for (i = 0; i < f_variable_size; i++)
	{
		assert(f_variables[i]->symbol.d >= 0);
		const ccv_nnc_tensor_variable_graph_bind_t* const f_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
		assert(f_symbol_extra->sources && f_symbol_extra->sources->rnum > 0);
		f_source_size += f_symbol_extra->sources->rnum;
	}
	assert(!dynamic_graph->no_grad);
	for (i = 0; i < input_size; i++)
	{
		// Constants have no gradient, and each input must have at least one consumer on the tape.
		assert(inputs[i]->type != CCV_NNC_TENSOR_CONSTANT);
		assert(inputs[i]->symbol.d >= 0);
		assert(((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations &&
			((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations->rnum > 0);
	}
	// Fill in the symbol info for outputs.
	for (i = 0; i < output_size; i++)
		if (outputs[i] && ccv_nnc_is_tensor_auto(outputs[i]->info))
			outputs[i]->info = inputs[i]->info; // Gradient takes the shape/type of the input it differentiates.
	const int exec_symbol_info_size = ccv_nnc_graph_exec_symbol_count(dynamic_graph->tape);
	ccv_array_t* const sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 1, 0);
	// ws is a reusable scratch workspace: two int buffers of exec_symbol_info_size each,
	// followed by a visited bitmap of (exec_symbol_info_size + 31) >> 5 uint32 words.
	if (!dynamic_graph->ws)
		dynamic_graph->ws = ccv_array_new(sizeof(int), exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5), 0);
	ccv_array_t* const ws = dynamic_graph->ws;
	ccv_array_resize(ws, exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5));
	// set visited to all 0.
	memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
	// Seed `sources` with the consumers of each input, keeping only execs that
	// are prior to something already collected (dedup via the visited bitmap).
	for (i = 0; i < input_size; i++)
	{
		ccv_array_t* const destinations = ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations;
		for (j = 0; j < destinations->rnum; j++)
			ccv_nnc_insert_if_prior_to_any(dynamic_graph->tape,
				*(int*)ccv_array_get(destinations, j),
				sources, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
				(int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
	}
	// Collect the producers of every f_variable as destinations, de-duplicated by exec index.
	ccv_array_t* const destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), f_source_size, 0);
	for (i = 0; i < f_variable_size; i++)
	{
		const ccv_nnc_tensor_variable_graph_bind_t* const loss_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
		for (j = 0; j < loss_symbol_extra->sources->rnum; j++)
		{
			const int symbol_d = *(int*)ccv_array_get(loss_symbol_extra->sources, j);
			int flag = 0;
			for (k = 0; !flag && k < destinations->rnum; k++)
				flag = (symbol_d == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, k))->d);
			if (!flag)
			{
				const ccv_nnc_graph_exec_symbol_t symbol = {
					.d = symbol_d,
					.graph = dynamic_graph->tape
				};
				ccv_array_push(destinations, &symbol);
			}
		}
	}
	// Go over sources, because destinations will get removed all the time, thus, the index is not accurate.
	if (destinations->rnum > 1)
		for (i = 0; i < destinations->rnum; i++)
		{
			// Drop any destination that is an ancestor of another destination;
			// the visited bitmap must be cleared before each reachability walk.
			memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
			ccv_nnc_remove_if_prior_to_any(dynamic_graph->tape,
				((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, i))->d,
				destinations, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
				(int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
		}
	ccv_nnc_tensor_symbol_t f_symbols[f_variable_size];
	for (i = 0; i < f_variable_size; i++)
		f_symbols[i] = f_variables[i]->symbol;
	ccv_nnc_tensor_symbol_t input_symbols[input_size];
	for (i = 0; i < input_size; i++)
		input_symbols[i] = inputs[i]->symbol;
	// Record every symbol created from here on, so the tape can be restored at the end.
	ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack, 0);
	// Emit the gradient sub-graph onto the tape.
	ccv_nnc_symbolic_graph_backward(dynamic_graph->tape,
		f_symbols, f_variable_size, input_symbols, input_size,
		(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
		(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum);
	// Bind generated tensors.
	ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), dynamic_graph->vars->rnum + 2, 0);
	for (i = 0; i < dynamic_graph->vars->rnum; i++)
	{
		ccv_nnc_tensor_variable_t var = *(ccv_nnc_tensor_variable_t*)ccv_array_get(dynamic_graph->vars, i);
		if (var && var->tensor_view && var->symbol.d >= 0)
		{
			ccv_nnc_tensor_bind_t bind = {
				.symbol = var->symbol,
				.tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(var->tensor_view)
			};
			ccv_array_push(tensor_binds, &bind);
		}
	}
	// Also bind tensors whose variable was freed but whose symbol is still referenced on the tape.
	for (i = 0; i < dynamic_graph->binds->rnum; i++)
	{
		ccv_nnc_tensor_variable_graph_bind_t* const bind = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, i);
		if (bind->index == CCV_NNC_TENSOR_NO_VARIABLE_BUT_USED && bind->tensor_view)
		{
			ccv_nnc_tensor_bind_t b = {
				.symbol = {
					.d = i,
					.graph = dynamic_graph->tape,
				},
				.tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(bind->tensor_view)
			};
			ccv_array_push(tensor_binds, &b);
		}
	}
	// Compiled graph comes from the df.
	ccv_array_clear(sources);
	ccv_nnc_tensor_symbol_t df_symbols[f_variable_size];
	for (i = 0; i < f_variable_size; i++)
	{
		df_symbols[i] = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, f_variables[i]->symbol);
		// NOTE(review): this checks f_symbols[i], not the just-computed df_symbols[i].
		// Possibly a typo for df_symbols — but later code deliberately tolerates
		// !df_symbols[i].graph (see the "Skip" path below), so confirm intent before changing.
		assert(f_symbols[i].d >= 0);
	}
	// The new sources are the execs downstream of the old destinations that consume a df symbol (or an alias of one).
	for (d = 0; d < destinations->rnum; d++)
	{
		const ccv_nnc_graph_exec_symbol_t* const destination = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, d);
		const int* outgoings; int outgoing_size;
		ccv_nnc_graph_exec_symbol_to(dynamic_graph->tape, *destination, &outgoings, &outgoing_size);
		for (i = 0; i < outgoing_size; i++)
		{
			const int exec_idx = outgoings[i];
			const int* inputs; int input_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
				.d = exec_idx,
				.graph = dynamic_graph->tape
			}, &inputs, &input_size, 0, 0);
			for (j = 0; j < input_size; j++)
			{
				const int input = inputs[j];
				const int alias_ref = input >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
					.d = input,
					.graph = dynamic_graph->tape
				}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
				// alias_ref is either exists, or -1.
				int flag = 0;
				for (k = 0; !flag && k < f_variable_size; k++)
					flag = (df_symbols[k].d == input || df_symbols[k].d == alias_ref);
				if (flag)
				{
					// This exec consumes a df symbol; add it to sources (dedup by exec index).
					flag = 0;
					for (k = 0; !flag && k < sources->rnum; k++)
						flag = (exec_idx == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, k))->d);
					if (!flag)
					{
						const ccv_nnc_graph_exec_symbol_t source = {
							.d = exec_idx,
							.graph = dynamic_graph->tape
						};
						ccv_array_push(sources, &source);
					}
					break;
				}
			}
		}
	}
	int freeable_size = 0;
	ccv_nnc_tensor_variable_t freeables[output_size];
	ccv_array_clear(destinations);
	// Size the scratch VLAs below by the widest exec IO among sources and per-output backward execs.
	int max_input_size = 1;
	int max_output_size = 1;
	for (i = 0; i < sources->rnum; i++)
	{
		const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, i);
		int input_size; int output_size;
		ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, 0, &input_size, 0, &output_size);
		max_input_size = ccv_max(input_size, max_input_size);
		max_output_size = ccv_max(output_size, max_output_size);
	}
	for (i = 0; i < output_size; i++)
	{
		const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
		ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
		int input_size; int output_size;
		ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, 0, &input_size, 0, &output_size);
		max_input_size = ccv_max(input_size, max_input_size);
		max_output_size = ccv_max(output_size, max_output_size);
	}
	const int max_input_bitmask_size = ((max_input_size + 63) >> 6);
	const int max_output_bitmask_size = ((max_output_size + 63) >> 6);
	ccv_nnc_tensor_symbol_t temp_input_symbols[max_input_size];
	ccv_nnc_tensor_symbol_t temp_output_symbols[max_output_size];
	uint64_t temp_input_bitmasks[max_input_bitmask_size];
	uint64_t temp_output_bitmasks[max_output_bitmask_size];
	// Bind dt tensor.
	for (i = 0; i < output_size; i++)
	{
		const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
		ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
		if (outputs[i])
		{
			if (ccv_nnc_tensor_variable_contains_value(outputs[i]))
			{
				// If the output tensors already exist, we need to accumulate the result.
				// However, if this tensor is set from outside, we don't accumulate on that
				// (these maybe people just want to collect the result in explicit way).
				// On the other hand, if these external tensor views has a symbol associated
				// with them, they are not made to collect results. They are probably bind in
				// previous computations.
				// The above logic is convoluted, but it should make intuitive sense in many
				// cases.
				ccv_nnc_tensor_symbol_t inputs[2];
				inputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
				inputs[1] = symbol;
				const ccv_nnc_tensor_symbol_t output = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
				ccv_nnc_tensor_bind_t dt_bind = {
					.symbol = inputs[0],
					.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context)
				};
				ccv_array_push(tensor_binds, &dt_bind);
				// new_output = old_value + gradient, via an elementwise sum exec appended after the backward exec.
				ccv_nnc_graph_exec_symbol_t accum = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
				ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, destination, accum);
				destination = accum; // The accumulation unit becomes the new destination.
				// Swap outputs[i] to a fresh variable; the old one is freed after the run (see freeables at the end).
				freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, outputs[i]);
				dt_bind.symbol = output;
				dt_bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
				ccv_array_push(tensor_binds, &dt_bind);
			} else {
				assert(outputs[i]->symbol.d < 0);
				// Otherwise, we can directly bind to the backward output.
				ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
				const ccv_nnc_tensor_bind_t dt_bind = {
					.symbol = symbol,
					.tensor = tensor
				};
				ccv_array_push(tensor_binds, &dt_bind);
			}
		} else {
			// Remove this symbol if it is possible, since we don't have any use of it.
			// This won't cover cases where we need to merge them together (hence, the cmd will be sum), so it is the best guess.
			const int* inputs; int input_size;
			const int* outputs; int output_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, &inputs, &input_size, &outputs, &output_size);
			ccv_nnc_tensor_symbol_t* input_symbols = temp_input_symbols;
			ccv_nnc_tensor_symbol_t* output_symbols = temp_output_symbols;
			uint64_t* input_bitmasks = temp_input_bitmasks;
			uint64_t* output_bitmasks = temp_output_bitmasks;
			memset(input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
			memset(output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
			const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, destination);
			// Now, check to see if we can remove this symbol from this source.
			for (k = 0; k < input_size; k++)
				if (inputs[k] >= 0)
					input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
			int flag = 0;
			for (k = 0; k < output_size; k++)
				if (outputs[k] >= 0 && outputs[k] != symbol.d)
				{
					output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
					flag = 1;
				}
			// If we can omit this output (or there is no output at all).
			if (!flag || ccv_nnc_cmd_bitmask(cmd, input_size, output_size, input_bitmasks, (input_size + 63) >> 6, output_bitmasks, (output_size + 63) >> 6))
			{
				// Set the new outputs by omitting the one.
				for (k = 0; k < input_size; k++)
					input_symbols[k] = (ccv_nnc_tensor_symbol_t){
						.d = inputs[k],
						.graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
					};
				for (k = 0; k < output_size; k++)
					if (outputs[k] != symbol.d)
						output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = outputs[k],
							.graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
						};
					else
						output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = CCV_NNC_NO_TENSOR_SYMBOL,
							.graph = 0,
						};
				ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, destination, input_symbols, input_size, output_symbols, output_size);
				// If there is no output, and this is not custom (custom may have side effect,
				// whereas the normal ops are side-effect free), set this symbol to be a noop.
				// TODO: This could be other cases regarding CCV_NNC_GRAPH_BACKWARD.
				if (!flag &&
					cmd.cmd != CCV_NNC_CUSTOM_FORWARD &&
					cmd.cmd != CCV_NNC_CUSTOM_BACKWARD)
					ccv_nnc_graph_exec_symbol_set(dynamic_graph->tape, destination, ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0));
			}
		}
		ccv_array_push(destinations, &destination);
	}
	// Remove the hook only at this point.
	ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0, 0);
	ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
	// Compile with the dynamic-graph XPU allocator (see ccv_nnc_dy_allocator_isa above).
	ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
		.xpu_alloc = &dynamic_graph->xpu_alloc,
		.stream = stream_context
	};
	ccv_nnc_symbolic_graph_compile_param_t compile_params = {
		.allocator = {
			.isa = &ccv_nnc_dy_allocator_isa,
			.context = {
				.alloc = &xpu_alloc,
				.free = &dynamic_graph->xpu_alloc,
			}
		}
	};
	ccv_nnc_graph_t* graph = 0;
	ccv_nnc_tensor_arena_t* tensor_arena = 0;
	ccv_nnc_graph_exec_arena_t* exec_arena = 0;
	// TODO: Should apply simplification right after the backward pass generated.
	// Remove these if it is not needed by the cmd, for example, if absence assumed to be 1.
	for (i = 0; i < f_variable_size; i++)
	{
		// If the caller supplied a gradient seed for this f, just bind it to df.
		if (df_optionals && df_optionals[i])
		{
			const ccv_nnc_tensor_bind_t df_bind = {
				.symbol = df_symbols[i],
				.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, df_optionals[i], stream_context)
			};
			ccv_array_push(tensor_binds, &df_bind);
			continue;
		}
		if (!df_symbols[i].graph) // Skip.
			continue;
		int no_set = 0; // If we cannot find the df_symbols in all sources, we cannot predict whether it is used or not.
		for (j = 0; j < sources->rnum; j++)
		{
			const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
			const int* inputs; int input_size;
			const int* outputs; int output_size;
			ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
			const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, source);
			int flag = 0;
			for (k = 0; !flag && k < input_size; k++)
			{
				const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
					.d = inputs[k],
					.graph = dynamic_graph->tape
				}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
				flag = (df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref);
			}
			if (flag)
			{
				no_set = 1;
				// Now, check to see if we can remove this symbol from this source.
				memset(temp_input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
				memset(temp_output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
				for (k = 0; k < input_size; k++)
					if (inputs[k] >= 0)
					{
						const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
							.d = inputs[k],
							.graph = dynamic_graph->tape
						}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
						if (df_symbols[i].d != inputs[k] && df_symbols[i].d != alias_ref)
							temp_input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
					}
				for (k = 0; k < output_size; k++)
					if (outputs[k] >= 0)
						temp_output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
				// If the cmd rejects the IO pattern with df removed, df is genuinely needed; keep it.
				if (!ccv_nnc_cmd_bitmask(cmd, input_size, output_size, temp_input_bitmasks, (input_size + 63) >> 6, temp_output_bitmasks, (output_size + 63) >> 6))
					no_set = 0;
			}
		}
		if (no_set) // Remove this flag from all sources and continue.
		{
			for (j = 0; j < sources->rnum; j++)
			{
				const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
				const int* inputs; int input_size;
				const int* outputs; int output_size;
				ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
				int flag = 0;
				for (k = 0; !flag && k < input_size; k++)
				{
					const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
						.d = inputs[k],
						.graph = dynamic_graph->tape
					}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
					flag = (df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref);
				}
				if (flag)
				{
					// Rewrite this exec's IO with the df symbol (and its aliases) blanked out.
					for (k = 0; k < input_size; k++)
						if (inputs[k] >= 0)
						{
							const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
								.d = inputs[k],
								.graph = dynamic_graph->tape
							}).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
							const int no_symbol = df_symbols[i].d == inputs[k] || df_symbols[i].d == alias_ref;
							temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
								.d = no_symbol ? CCV_NNC_NO_TENSOR_SYMBOL : inputs[k],
								.graph = no_symbol ? 0 : dynamic_graph->tape,
							};
						} else {
							temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
								.d = inputs[k],
								.graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
							};
						}
					for (k = 0; k < output_size; k++)
						temp_output_symbols[k] = (ccv_nnc_tensor_symbol_t){
							.d = outputs[k],
							.graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
						};
					ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, source, temp_input_symbols, input_size, temp_output_symbols, output_size);
				}
			}
			df_symbols[i].graph = 0; // Mark this df as eliminated so no SET(1) is emitted for it.
		}
	}
	// Aggregate them into one set command.
	ccv_nnc_tensor_symbol_t df_symbols_0[f_variable_size];
	ccv_nnc_graph_exec_symbol_t set_ones[f_variable_size];
	int set_one_size = 0;
	// Group remaining df symbols by tensor param type; each group is seeded with one SET(1) exec.
	for (i = 0; i < f_variable_size;)
		if ((df_optionals && df_optionals[i]) || !df_symbols[i].graph) // Skip.
			++i;
		else {
			df_symbols_0[0] = df_symbols[i];
			k = 1;
			int idx = f_variable_size; // First index with a mismatching type; resume scanning there.
			const ccv_nnc_tensor_param_t params_0 = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols_0[0]);
			for (j = i + 1; j < f_variable_size; j++)
				if (df_symbols[j].graph)
				{
					const ccv_nnc_tensor_param_t params_j = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols[j]);
					if (params_j.type != params_0.type)
					{
						if (idx == f_variable_size)
							idx = j;
					} else {
						df_symbols_0[k++] = df_symbols[j];
						assert(df_symbols[j].graph == dynamic_graph->tape);
						df_symbols[j].graph = 0; // Consumed into this group; cleared so it is not grouped twice.
					}
				}
			i = idx;
			set_ones[set_one_size] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(1), 0, 0, df_symbols_0, k, 0);
			for (j = 0; j < sources->rnum; j++)
				ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_ones[set_one_size], *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j));
			++set_one_size;
		}
	// Reset it back.
	for (i = 0; i < f_variable_size; i++)
		df_symbols[i].graph = dynamic_graph->tape;
	if (set_one_size > 0)
	{
		// The SET(1) execs are the real roots of the compiled graph.
		ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
			(ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
			0, 0,
			set_ones, set_one_size,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
			&graph, &tensor_arena, &exec_arena);
	} else {
		// Otherwise we don't have a single set ones, in this case, we still compile from source.
		ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
			(ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
			0, 0,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
			(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
			&graph, &tensor_arena, &exec_arena);
	}
	ccv_array_free(sources);
	for (i = 0; i < set_one_size; i++)
		ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, set_ones[i]);
	ccv_array_free(destinations);
	ccv_array_free(tensor_binds);
	// Remove newly added symbols to restore the graph.
	for (i = 0; i < symbol_stack->rnum; i++)
	{
		const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
		if (symbol->type == CCV_NNC_SYMBOL_TENSOR || symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS)
			ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
		else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
			ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
				.d = symbol->d,
				.graph = dynamic_graph->tape
			});
	}
	ccv_array_free(symbol_stack);
	// Go through inputs and outputs to find out stream type and parallel counts.
	int multi_device = 0;
	for (i = 1; !multi_device && i < input_size; i++)
		multi_device = (CCV_TENSOR_GET_DEVICE(inputs[i - 1]->info.type) != CCV_TENSOR_GET_DEVICE(inputs[i]->info.type));
	if (stream_context)
	{
		// Asynchronous path: schedule on the caller's stream; cleanup is deferred
		// to a stream callback (except on MPS, where it is freed inline).
		ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context), dynamic_graph->max_stream_count);
		ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
#ifdef HAVE_MPS
		// This might be problematic when we will actually have async logic going on.
		ccv_nnc_graph_free(graph);
		ccv_nnc_tensor_arena_free(tensor_arena);
		ccv_nnc_graph_exec_arena_free(exec_arena);
#else
		ccv_nnc_tensor_arena_buffer_free(tensor_arena);
		ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
		ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
#endif
	} else {
		if (multi_device)
		{
			// Inputs live on different devices: run on a default stream and wait for completion.
			int flag = 0;
			for (i = 0; !flag && i < input_size; i++)
				flag = (CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
			const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
			ccv_nnc_graph_set_default_static_schedule(graph, stream_type, dynamic_graph->max_stream_count);
			ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
			ccv_nnc_stream_context_wait(default_stream);
		} else
			ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
		ccv_nnc_graph_free(graph);
		ccv_nnc_tensor_arena_free(tensor_arena);
		ccv_nnc_graph_exec_arena_free(exec_arena);
	}
	// Now, able to free some of the reused outputs. This need to be the last step otherwise some of the exec symbols
	// above may be freed by this operation.
	for (i = 0; i < freeable_size; i++)
		ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
}