Coverage Report

Created: 2021-09-30 20:21

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_nnc_dynamic_graph_backward.c
Line
Count
Source (jump to first uncovered line)
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_nnc_easy.h"
5
#include "ccv_internal.h"
6
#ifdef HAVE_CUDA
7
#include "gpu/ccv_nnc_compat.h"
8
#endif
9
#include "_ccv_nnc_dynamic_graph.h"
10
11
// MARK - Level-4.5 API
12
13
static void* _ccv_nnc_dynamic_compile_alloc(const int type, const int pinned_mem, const size_t size, void* const arg)
14
260
{
15
260
  assert(type & CCV_TENSOR_GPU_MEMORY);
16
260
  ccv_nnc_dy_xpu_alloc_t* const xpu_alloc  = (ccv_nnc_dy_xpu_alloc_t*)arg;
17
260
  const int device = CCV_TENSOR_GET_DEVICE_ID(type);
18
260
  return ccv_nnc_dynamic_graph_xpu_alloc(xpu_alloc->graph, device, xpu_alloc->stream, size);
19
260
}
20
21
static void _ccv_nnc_dynamic_compile_free(void* const ptr, void* const arg)
22
260
{
23
260
  ccv_nnc_dynamic_graph_t* const graph = (ccv_nnc_dynamic_graph_t*)arg;
24
260
  ccv_nnc_dynamic_graph_xpu_free(graph, ptr);
25
260
}
26
27
const ccv_nnc_symbolic_graph_compile_allocator_vtab_t ccv_nnc_dy_allocator_isa = {
28
  .alloc = _ccv_nnc_dynamic_compile_alloc,
29
  .free = _ccv_nnc_dynamic_compile_free
30
};
31
32
void ccv_nnc_dynamic_graph_backward(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_tensor_variable_t* const f_variables, const int f_variable_size, const ccv_nnc_tensor_variable_t* const df_optionals, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
33
3.41k
{
34
3.41k
  int d, i, j, k;
35
3.41k
  assert(input_size == output_size);
36
3.41k
  assert(input_size > 0);
37
3.41k
  assert(output_size > 0);
38
3.41k
  assert(f_variable_size > 0);
39
3.41k
  int f_source_size = 0;
40
3.41k
  // Both f_variable and tensor_variable should be, at least, executed. Otherwise we cannot differentiate.
41
6.84k
  for (i = 0; i < f_variable_size; 
i++3.42k
)
42
3.42k
  {
43
3.42k
    assert(f_variables[i]->symbol.d >= 0);
44
3.42k
    const ccv_nnc_tensor_variable_graph_bind_t* const f_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
45
3.42k
    assert(f_symbol_extra->sources && f_symbol_extra->sources->rnum > 0);
46
3.42k
    f_source_size += f_symbol_extra->sources->rnum;
47
3.42k
  }
48
3.41k
  assert(!dynamic_graph->no_grad);
49
6.85k
  
for (i = 0; 3.41k
i < input_size;
i++3.43k
)
50
3.43k
  {
51
3.43k
    assert(inputs[i]->type != CCV_NNC_TENSOR_CONSTANT);
52
3.43k
    assert(inputs[i]->symbol.d >= 0);
53
3.43k
    assert(((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations &&
54
3.43k
      ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations->rnum > 0);
55
3.43k
  }
56
3.41k
  // Fill in the symbol info for outputs.
57
6.85k
  
for (i = 0; 3.41k
i < output_size;
i++3.43k
)
58
3.43k
    if (outputs[i] && 
ccv_nnc_is_tensor_auto(outputs[i]->info)3.24k
)
59
1.23k
      outputs[i]->info = inputs[i]->info;
60
3.41k
  const int exec_symbol_info_size = ccv_nnc_graph_exec_symbol_count(dynamic_graph->tape);
61
3.41k
  ccv_array_t* const sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 1, 0);
62
3.41k
  if (!dynamic_graph->ws)
63
12
    dynamic_graph->ws = ccv_array_new(sizeof(int), exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5), 0);
64
3.41k
  ccv_array_t* const ws = dynamic_graph->ws;
65
3.41k
  ccv_array_resize(ws, exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5));
66
3.41k
  // set visited to all 0.
67
3.41k
  memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
68
6.85k
  for (i = 0; i < input_size; 
i++3.43k
)
69
3.43k
  {
70
3.43k
    ccv_array_t* const destinations = ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, inputs[i]->symbol.d))->destinations;
71
6.90k
    for (j = 0; j < destinations->rnum; 
j++3.47k
)
72
3.47k
      ccv_nnc_insert_if_prior_to_any(dynamic_graph->tape,
73
3.47k
        *(int*)ccv_array_get(destinations, j),
74
3.47k
        sources, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
75
3.47k
        (int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
76
3.43k
  }
77
3.41k
  ccv_array_t* const destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), f_source_size, 0);
78
6.84k
  for (i = 0; i < f_variable_size; 
i++3.42k
)
79
3.42k
  {
80
3.42k
    const ccv_nnc_tensor_variable_graph_bind_t* const loss_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, f_variables[i]->symbol.d);
81
6.85k
    for (j = 0; j < loss_symbol_extra->sources->rnum; 
j++3.43k
)
82
3.43k
    {
83
3.43k
      const int symbol_d = *(int*)ccv_array_get(loss_symbol_extra->sources, j);
84
3.43k
      int flag = 0;
85
3.44k
      for (k = 0; !flag && k < destinations->rnum; 
k++19
)
86
19
        flag = (symbol_d == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, k))->d);
87
3.43k
      if (!flag)
88
3.43k
      {
89
3.43k
        const ccv_nnc_graph_exec_symbol_t symbol = {
90
3.43k
          .d = symbol_d,
91
3.43k
          .graph = dynamic_graph->tape
92
3.43k
        };
93
3.43k
        ccv_array_push(destinations, &symbol);
94
3.43k
      }
95
3.43k
    }
96
3.42k
  }
97
3.41k
  // Go over sources, because destinations will get removed all the time, thus, the index is not accurate.
98
3.41k
  if (destinations->rnum > 1)
99
25
    
for (i = 0; 7
i < destinations->rnum;
i++18
)
100
18
    {
101
18
      memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
102
18
      ccv_nnc_remove_if_prior_to_any(dynamic_graph->tape,
103
18
        ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, i))->d,
104
18
        destinations, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
105
18
        (int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
106
18
    }
107
3.41k
  ccv_nnc_tensor_symbol_t f_symbols[f_variable_size];
108
6.84k
  for (i = 0; i < f_variable_size; 
i++3.42k
)
109
3.42k
    f_symbols[i] = f_variables[i]->symbol;
110
3.41k
  ccv_nnc_tensor_symbol_t input_symbols[input_size];
111
6.85k
  for (i = 0; i < input_size; 
i++3.43k
)
112
3.43k
    input_symbols[i] = inputs[i]->symbol;
113
3.41k
  ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
114
3.41k
  ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack);
115
3.41k
  ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack);
116
3.41k
  ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack);
117
3.41k
  ccv_nnc_symbolic_graph_backward(dynamic_graph->tape,
118
3.41k
    f_symbols, f_variable_size, input_symbols, input_size,
119
3.41k
    (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
120
3.41k
    (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum);
121
3.41k
  // Bind generated tensors.
122
3.41k
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), dynamic_graph->vars->rnum + 2, 0);
123
32.5k
  for (i = 0; i < dynamic_graph->vars->rnum; 
i++29.1k
)
124
29.1k
  {
125
29.1k
    ccv_nnc_tensor_variable_t var = *(ccv_nnc_tensor_variable_t*)ccv_array_get(dynamic_graph->vars, i);
126
29.1k
    if (var && 
var->tensor_view29.1k
&&
var->symbol.d >= 027.8k
)
127
22.8k
    {
128
22.8k
      ccv_nnc_tensor_bind_t bind = {
129
22.8k
        .symbol = var->symbol,
130
22.8k
        .tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(var->tensor_view)
131
22.8k
      };
132
22.8k
      ccv_array_push(tensor_binds, &bind);
133
22.8k
    }
134
29.1k
  }
135
26.6k
  for (i = 0; i < dynamic_graph->binds->rnum; 
i++23.2k
)
136
23.2k
  {
137
23.2k
    ccv_nnc_tensor_variable_graph_bind_t* const bind = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, i);
138
23.2k
    if (bind->index == CCV_NNC_TENSOR_NO_VARIABLE_BUT_USED && 
bind->tensor_view416
)
139
416
    {
140
416
      ccv_nnc_tensor_bind_t b = {
141
416
        .symbol = {
142
416
          .d = i,
143
416
          .graph = dynamic_graph->tape,
144
416
        },
145
416
        .tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(bind->tensor_view)
146
416
      };
147
416
      ccv_array_push(tensor_binds, &b);
148
416
    }
149
23.2k
  }
150
3.41k
  // Compiled graph comes from the df.
151
3.41k
  ccv_array_clear(sources);
152
3.41k
  ccv_nnc_tensor_symbol_t df_symbols[f_variable_size];
153
6.84k
  for (i = 0; i < f_variable_size; 
i++3.42k
)
154
3.42k
  {
155
3.42k
    df_symbols[i] = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, f_variables[i]->symbol);
156
3.42k
    assert(f_symbols[i].d >= 0);
157
3.42k
  }
158
6.84k
  
for (d = 0; 3.41k
d < destinations->rnum;
d++3.42k
)
159
3.42k
  {
160
3.42k
    const ccv_nnc_graph_exec_symbol_t* const destination = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, d);
161
3.42k
    const int* outgoings; int outgoing_size;
162
3.42k
    ccv_nnc_graph_exec_symbol_to(dynamic_graph->tape, *destination, &outgoings, &outgoing_size);
163
6.85k
    for (i = 0; i < outgoing_size; 
i++3.42k
)
164
3.42k
    {
165
3.42k
      const int exec_idx = outgoings[i];
166
3.42k
      const int* inputs; int input_size;
167
3.42k
      ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
168
3.42k
        .d = exec_idx,
169
3.42k
        .graph = dynamic_graph->tape
170
3.42k
      }, &inputs, &input_size, 0, 0);
171
3.63k
      for (j = 0; j < input_size; 
j++204
)
172
3.63k
      {
173
3.63k
        const int input = inputs[j];
174
3.63k
        const int alias_ref = input >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
175
3.42k
          .d = input,
176
3.42k
          .graph = dynamic_graph->tape
177
3.42k
        }).d : 
CCV_NNC_NO_TENSOR_SYMBOL204
; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
178
3.63k
        // alias_ref is either exists, or -1.
179
3.63k
        int flag = 0;
180
7.30k
        for (k = 0; !flag && 
k < f_variable_size3.87k
;
k++3.67k
)
181
3.67k
          flag = (df_symbols[k].d == input || 
df_symbols[k].d == alias_ref245
);
182
3.63k
        if (flag)
183
3.42k
        {
184
3.42k
          flag = 0;
185
3.44k
          for (k = 0; !flag && k < sources->rnum; 
k++16
)
186
16
            flag = (exec_idx == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, k))->d);
187
3.42k
          if (!flag)
188
3.42k
          {
189
3.42k
            const ccv_nnc_graph_exec_symbol_t source = {
190
3.42k
              .d = exec_idx,
191
3.42k
              .graph = dynamic_graph->tape
192
3.42k
            };
193
3.42k
            ccv_array_push(sources, &source);
194
3.42k
          }
195
3.42k
          break;
196
3.42k
        }
197
3.63k
      }
198
3.42k
    }
199
3.42k
  }
200
3.41k
  int freeable_size = 0;
201
3.41k
  ccv_nnc_tensor_variable_t freeables[output_size];
202
3.41k
  ccv_array_clear(destinations);
203
3.41k
  int max_input_size = 1;
204
3.41k
  int max_output_size = 1;
205
6.84k
  for (i = 0; i < sources->rnum; 
i++3.42k
)
206
3.42k
  {
207
3.42k
    const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, i);
208
3.42k
    int input_size; int output_size;
209
3.42k
    ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, 0, &input_size, 0, &output_size);
210
3.42k
    max_input_size = ccv_max(input_size, max_input_size);
211
3.42k
    max_output_size = ccv_max(output_size, max_output_size);
212
3.42k
  }
213
6.85k
  for (i = 0; i < output_size; 
i++3.43k
)
214
3.43k
  {
215
3.43k
    const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
216
3.43k
    ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
217
3.43k
    int input_size; int output_size;
218
3.43k
    ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, 0, &input_size, 0, &output_size);
219
3.43k
    max_input_size = ccv_max(input_size, max_input_size);
220
3.43k
    max_output_size = ccv_max(output_size, max_output_size);
221
3.43k
  }
222
3.41k
  const int max_input_bitmask_size = ((max_input_size + 63) >> 6);
223
3.41k
  const int max_output_bitmask_size =  ((max_output_size + 63) >> 6);
224
3.41k
  ccv_nnc_tensor_symbol_t temp_input_symbols[max_input_size];
225
3.41k
  ccv_nnc_tensor_symbol_t temp_output_symbols[max_output_size];
226
3.41k
  uint64_t temp_input_bitmasks[max_input_bitmask_size];
227
3.41k
  uint64_t temp_output_bitmasks[max_output_bitmask_size];
228
3.41k
  // Bind dt tensor.
229
6.85k
  for (i = 0; i < output_size; 
i++3.43k
)
230
3.43k
  {
231
3.43k
    const ccv_nnc_tensor_symbol_t symbol = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, input_symbols[i]);
232
3.43k
    ccv_nnc_graph_exec_symbol_t destination = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, symbol);
233
3.43k
    if (outputs[i])
234
3.24k
    {
235
3.24k
      if (ccv_nnc_tensor_variable_contains_value(outputs[i]))
236
2.00k
      {
237
2.00k
        // If the output tensors already exist, we need to accumulate the result.
238
2.00k
        // However, if this tensor is set from outside, we don't accumulate on that
239
2.00k
        // (these maybe people just want to collect the result in explicit way).
240
2.00k
        // On the other hand, if these external tensor views has a symbol associated
241
2.00k
        // with them, they are not made to collect results. They are probably bind in
242
2.00k
        // previous computations.
243
2.00k
        // The above logic is convoluted, but it should make intuitive sense in many
244
2.00k
        // cases.
245
2.00k
        ccv_nnc_tensor_symbol_t inputs[2];
246
2.00k
        inputs[0] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
247
2.00k
        inputs[1] = symbol;
248
2.00k
        const ccv_nnc_tensor_symbol_t output = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, outputs[i]->info, 0);
249
2.00k
        ccv_nnc_tensor_bind_t dt_bind = {
250
2.00k
          .symbol = inputs[0],
251
2.00k
          .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context)
252
2.00k
        };
253
2.00k
        ccv_array_push(tensor_binds, &dt_bind);
254
2.00k
        ccv_nnc_graph_exec_symbol_t accum = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
255
2.00k
        ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, destination, accum);
256
2.00k
        destination = accum; // The accumulation unit becomes the new destination.
257
2.00k
        freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, outputs[i]);
258
2.00k
        dt_bind.symbol = output;
259
2.00k
        dt_bind.tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
260
2.00k
        ccv_array_push(tensor_binds, &dt_bind);
261
2.00k
      } else {
262
1.23k
        assert(outputs[i]->symbol.d < 0);
263
1.23k
        // Otherwise, we can directly bind to the backward output.
264
1.23k
        ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i], stream_context);
265
1.23k
        const ccv_nnc_tensor_bind_t dt_bind = {
266
1.23k
          .symbol = symbol,
267
1.23k
          .tensor = tensor
268
1.23k
        };
269
1.23k
        ccv_array_push(tensor_binds, &dt_bind);
270
1.23k
      }
271
3.24k
    } else {
272
196
      // Remove this symbol if it is possible, since we don't have any use of it.
273
196
      // This won't cover cases where we need to merge them together (hence, the cmd will be sum), so it is the best guess.
274
196
      const int* inputs; int input_size;
275
196
      const int* outputs; int output_size;
276
196
      ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, destination, &inputs, &input_size, &outputs, &output_size);
277
196
      ccv_nnc_tensor_symbol_t* input_symbols = temp_input_symbols;
278
196
      ccv_nnc_tensor_symbol_t* output_symbols = temp_output_symbols;
279
196
      uint64_t* input_bitmasks = temp_input_bitmasks;
280
196
      uint64_t* output_bitmasks = temp_output_bitmasks;
281
196
      memset(input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
282
196
      memset(output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
283
196
      const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, destination);
284
196
      // Now, check to see if we can remove this symbol from this source.
285
784
      for (k = 0; k < input_size; 
k++588
)
286
588
        if (inputs[k] >= 0)
287
588
          input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
288
196
      int flag = 0;
289
392
      for (k = 0; k < output_size; 
k++196
)
290
196
        if (outputs[k] >= 0 && outputs[k] != symbol.d)
291
0
        {
292
0
          output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
293
0
          flag = 1;
294
0
        }
295
196
      // If we can omit this output (or there is no output at all).
296
196
      if (!flag || 
ccv_nnc_cmd_bitmask(cmd, input_size, output_size, input_bitmasks, (input_size + 63) >> 6, output_bitmasks, (output_size + 63) >> 6)0
)
297
196
      {
298
196
        // Set the new outputs by omitting the one.
299
784
        for (k = 0; k < input_size; 
k++588
)
300
588
          input_symbols[k] = (ccv_nnc_tensor_symbol_t){
301
588
            .d = inputs[k],
302
588
            .graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 
00
,
303
588
          };
304
392
        for (k = 0; k < output_size; 
k++196
)
305
196
          if (outputs[k] != symbol.d)
306
0
            output_symbols[k] = (ccv_nnc_tensor_symbol_t){
307
0
              .d = outputs[k],
308
0
              .graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
309
0
            };
310
196
          else
311
196
            output_symbols[k] = (ccv_nnc_tensor_symbol_t){
312
196
              .d = CCV_NNC_NO_TENSOR_SYMBOL,
313
196
              .graph = 0,
314
196
            };
315
196
        ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, destination, input_symbols, input_size, output_symbols, output_size);
316
196
        // If there is no output, and this is not custom (custom may have side effect,
317
196
        // whereas the normal ops are side-effect free), set this symbol to be a noop.
318
196
        // TODO: This could be other cases regarding CCV_NNC_GRAPH_BACKWARD.
319
196
        if (!flag &&
320
196
          cmd.cmd != CCV_NNC_CUSTOM_FORWARD &&
321
196
          cmd.cmd != CCV_NNC_CUSTOM_BACKWARD)
322
0
          ccv_nnc_graph_exec_symbol_set(dynamic_graph->tape, destination, ccv_nnc_cmd(CCV_NNC_NOOP, 0, ccv_nnc_cmd_auto, 0));
323
196
      }
324
196
    }
325
3.43k
    ccv_array_push(destinations, &destination);
326
3.43k
  }
327
3.41k
  // Remove the hook only at this point.
328
3.41k
  ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0);
329
3.41k
  ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0);
330
3.41k
  ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0);
331
3.41k
  ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
332
3.41k
    .graph = dynamic_graph,
333
3.41k
    .stream = stream_context
334
3.41k
  };
335
3.41k
  ccv_nnc_symbolic_graph_compile_param_t compile_params = {
336
3.41k
    .allocator = {
337
3.41k
      .isa = &ccv_nnc_dy_allocator_isa,
338
3.41k
      .context = {
339
3.41k
        .alloc = &xpu_alloc,
340
3.41k
        .free = dynamic_graph,
341
3.41k
      }
342
3.41k
    }
343
3.41k
  };
344
3.41k
  ccv_nnc_graph_t* graph = 0;
345
3.41k
  ccv_nnc_tensor_arena_t* tensor_arena = 0;
346
3.41k
  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
347
3.41k
  // TODO: Should apply simplification right after the backward pass generated.
348
3.41k
  // Remove these if it is not needed by the cmd, for example, if absence assumed to be 1.
349
6.84k
  for (i = 0; i < f_variable_size; 
i++3.42k
)
350
3.42k
  {
351
3.42k
    if (df_optionals && 
df_optionals[i]2
)
352
2
    {
353
2
      const ccv_nnc_tensor_bind_t df_bind = {
354
2
        .symbol = df_symbols[i],
355
2
        .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, df_optionals[i], stream_context)
356
2
      };
357
2
      ccv_array_push(tensor_binds, &df_bind);
358
2
      continue;
359
2
    }
360
3.42k
    if (!df_symbols[i].graph) // Skip.
361
0
      continue;
362
3.42k
    int no_set = 0; // If we cannot find the df_symbols in all sources, we cannot predict whether it is used or not.
363
6.88k
    for (j = 0; j < sources->rnum; 
j++3.45k
)
364
3.45k
    {
365
3.45k
      const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
366
3.45k
      const int* inputs; int input_size;
367
3.45k
      const int* outputs; int output_size;
368
3.45k
      ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
369
3.45k
      const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, source);
370
3.45k
      int flag = 0;
371
7.25k
      for (k = 0; !flag && 
k < input_size3.83k
;
k++3.79k
)
372
3.79k
      {
373
3.79k
        const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
374
3.49k
          .d = inputs[k],
375
3.49k
          .graph = dynamic_graph->tape
376
3.49k
        }).d : 
CCV_NNC_NO_TENSOR_SYMBOL304
; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
377
3.79k
        flag = (df_symbols[i].d == inputs[k] || 
df_symbols[i].d == alias_ref373
);
378
3.79k
      }
379
3.45k
      if (flag)
380
3.42k
      {
381
3.42k
        no_set = 1;
382
3.42k
        // Now, check to see if we can remove this symbol from this source.
383
3.42k
        memset(temp_input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
384
3.42k
        memset(temp_output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
385
16.5k
        for (k = 0; k < input_size; 
k++13.1k
)
386
13.1k
          if (inputs[k] >= 0)
387
8.44k
          {
388
8.44k
            const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
389
8.44k
              .d = inputs[k],
390
8.44k
              .graph = dynamic_graph->tape
391
8.44k
            }).d : 
CCV_NNC_NO_TENSOR_SYMBOL0
; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
392
8.44k
            if (df_symbols[i].d != inputs[k] && 
df_symbols[i].d != alias_ref5.02k
)
393
5.02k
              temp_input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
394
8.44k
          }
395
9.27k
        for (k = 0; k < output_size; 
k++5.84k
)
396
5.84k
          if (outputs[k] >= 0)
397
5.63k
            temp_output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
398
3.42k
        if (!ccv_nnc_cmd_bitmask(cmd, input_size, output_size, temp_input_bitmasks, (input_size + 63) >> 6, temp_output_bitmasks, (output_size + 63) >> 6))
399
3.22k
          no_set = 0;
400
3.42k
      }
401
3.45k
    }
402
3.42k
    if (no_set) // Remove this flag from all sources and continue.
403
204
    {
404
432
      for (j = 0; j < sources->rnum; 
j++228
)
405
228
      {
406
228
        const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
407
228
        const int* inputs; int input_size;
408
228
        const int* outputs; int output_size;
409
228
        ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
410
228
        int flag = 0;
411
780
        for (k = 0; !flag && 
k < input_size576
;
k++552
)
412
552
        {
413
552
          const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
414
264
            .d = inputs[k],
415
264
            .graph = dynamic_graph->tape
416
288
          }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
417
552
          flag = (df_symbols[i].d == inputs[k] || 
df_symbols[i].d == alias_ref348
);
418
552
        }
419
228
        if (flag)
420
204
        {
421
1.42k
          for (k = 0; k < input_size; 
k++1.22k
)
422
1.22k
            if (inputs[k] >= 0)
423
612
            {
424
612
              const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
425
612
                .d = inputs[k],
426
612
                .graph = dynamic_graph->tape
427
612
              }).d : 
CCV_NNC_NO_TENSOR_SYMBOL0
; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
428
612
              const int no_symbol = df_symbols[i].d == inputs[k] || 
df_symbols[i].d == alias_ref408
;
429
612
              temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
430
612
                .d = no_symbol ? 
CCV_NNC_NO_TENSOR_SYMBOL204
:
inputs[k]408
,
431
612
                .graph = no_symbol ? 
0204
:
dynamic_graph->tape408
,
432
612
              };
433
612
            } else {
434
612
              temp_input_symbols[k] = (ccv_nnc_tensor_symbol_t){
435
612
                .d = inputs[k],
436
612
                .graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? 
dynamic_graph->tape0
: 0,
437
612
              };
438
612
            }
439
612
          for (k = 0; k < output_size; 
k++408
)
440
408
            temp_output_symbols[k] = (ccv_nnc_tensor_symbol_t){
441
408
              .d = outputs[k],
442
408
              .graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? 
dynamic_graph->tape204
:
0204
,
443
408
            };
444
204
          ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, source, temp_input_symbols, input_size, temp_output_symbols, output_size);
445
204
        }
446
228
      }
447
204
      df_symbols[i].graph = 0;
448
204
    }
449
3.42k
  }
450
3.41k
  // Aggregate them into one set command.
451
3.41k
  ccv_nnc_tensor_symbol_t df_symbols_0[f_variable_size];
452
3.41k
  ccv_nnc_graph_exec_symbol_t set_ones[f_variable_size];
453
3.41k
  int set_one_size = 0;
454
6.84k
  for (i = 0; i < f_variable_size;)
455
3.42k
    if ((df_optionals && 
df_optionals[i]2
) ||
!df_symbols[i].graph3.42k
) // Skip.
456
206
      ++i;
457
3.22k
    else {
458
3.22k
      df_symbols_0[0] = df_symbols[i];
459
3.22k
      k = 1;
460
3.22k
      int idx = f_variable_size;
461
3.22k
      const ccv_nnc_tensor_param_t params_0 = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols_0[0]);
462
3.22k
      for (j = i + 1; j < f_variable_size; 
j++4
)
463
4
        if (df_symbols[j].graph)
464
4
        {
465
4
          const ccv_nnc_tensor_param_t params_j = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, df_symbols[j]);
466
4
          if (params_j.type != params_0.type)
467
4
          {
468
4
            if (idx == f_variable_size)
469
4
              idx = j;
470
4
          } else {
471
0
            df_symbols_0[k++] = df_symbols[j];
472
0
            assert(df_symbols[j].graph == dynamic_graph->tape);
473
0
            df_symbols[j].graph = 0;
474
0
          }
475
4
        }
476
3.22k
      i = idx;
477
3.22k
      set_ones[set_one_size] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(1), 0, 0, df_symbols_0, k, 0);
478
6.45k
      for (j = 0; j < sources->rnum; 
j++3.23k
)
479
3.23k
        ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_ones[set_one_size], *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j));
480
3.22k
      ++set_one_size;
481
3.22k
    }
482
3.41k
  // Reset it back.
483
6.84k
  
for (i = 0; 3.41k
i < f_variable_size;
i++3.42k
)
484
3.42k
    df_symbols[i].graph = dynamic_graph->tape;
485
3.41k
  if (set_one_size > 0)
486
3.21k
  {
487
3.21k
    ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
488
3.21k
      (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
489
3.21k
      0, 0,
490
3.21k
      set_ones, set_one_size,
491
3.21k
      (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
492
3.21k
      &graph, &tensor_arena, &exec_arena);
493
3.21k
  } else {
494
200
    // Otherwise we don't have a single set ones, in this case, we still compile from source.
495
200
    ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
496
200
      (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
497
200
      0, 0,
498
200
      (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
499
200
      (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
500
200
      &graph, &tensor_arena, &exec_arena);
501
200
  }
502
3.41k
  ccv_array_free(sources);
503
6.64k
  for (i = 0; i < set_one_size; 
i++3.22k
)
504
3.22k
    ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, set_ones[i]);
505
3.41k
  ccv_array_free(destinations);
506
3.41k
  ccv_array_free(tensor_binds);
507
3.41k
  // Remove newly added symbols to restore the graph.
508
49.0k
  for (i = 0; i < symbol_stack->rnum; 
i++45.6k
)
509
45.6k
  {
510
45.6k
    const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
511
45.6k
    if (symbol->type == CCV_NNC_SYMBOL_TENSOR || 
symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS18.4k
)
512
28.1k
      ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
513
28.1k
        .d = symbol->d,
514
28.1k
        .graph = dynamic_graph->tape
515
28.1k
      });
516
17.4k
    else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
517
17.4k
      ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
518
17.4k
        .d = symbol->d,
519
17.4k
        .graph = dynamic_graph->tape
520
17.4k
      });
521
45.6k
  }
522
3.41k
  ccv_array_free(symbol_stack);
523
3.41k
  // Go through inputs and outputs to find out stream type and parallel counts.
524
3.41k
  int multi_device = 0;
525
3.42k
  for (i = 1; !multi_device && 
i < input_size3.42k
;
i++9
)
526
9
    multi_device = (CCV_TENSOR_GET_DEVICE(inputs[i - 1]->info.type) != CCV_TENSOR_GET_DEVICE(inputs[i]->info.type));
527
3.41k
  if (stream_context)
528
200
  {
529
200
    ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context));
530
200
    ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
531
200
    ccv_nnc_tensor_arena_buffer_free(tensor_arena);
532
200
    ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
533
200
    ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
534
3.21k
  } else {
535
3.21k
    if (multi_device)
536
2
    {
537
2
      int flag = 0;
538
4
      for (i = 0; !flag && 
i < input_size2
;
i++2
)
539
2
        flag = (CCV_TENSOR_GET_MEMORY(inputs[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
540
2
      const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : 
CCV_STREAM_CONTEXT_CPU0
;
541
2
      ccv_nnc_graph_set_default_static_schedule(graph, stream_type);
542
2
      ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
543
2
      ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
544
2
      ccv_nnc_stream_context_wait(default_stream);
545
2
    } else
546
3.21k
      ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
547
3.21k
    ccv_nnc_graph_free(graph);
548
3.21k
    ccv_nnc_tensor_arena_free(tensor_arena);
549
3.21k
    ccv_nnc_graph_exec_arena_free(exec_arena);
550
3.21k
  }
551
3.41k
  // Now, able to free some of the reused outputs. This need to be the last step otherwise some of the exec symbols
552
3.41k
  // above may be freed by this operation.
553
5.42k
  for (i = 0; i < freeable_size; 
i++2.00k
)
554
2.00k
    ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
555
3.41k
}