Coverage Report

Created: 2024-08-18 16:21

/home/liu/actions-runner/_work/ccv/ccv/lib/nnc/ccv_nnc_dynamic_graph_minimize.c
 Line|  Count|Source
    1|       |#include "ccv_nnc.h"
    2|       |#include "ccv_nnc_easy.h"
    3|       |#include "ccv_nnc_internal.h"
    4|       |#include "ccv_nnc_easy.h"
    5|       |#include "ccv_internal.h"
    6|       |#include "_ccv_nnc_dynamic_graph.h"
    7|       |
    8|       |// MARK - Level-4.5 API
    9|       |
   10|       |void ccv_nnc_dynamic_graph_minimize(ccv_nnc_dynamic_graph_t* const dynamic_graph, const ccv_nnc_cmd_t minimizer, const ccv_nnc_tensor_variable_t* const losses, const int loss_size, const ccv_nnc_tensor_variable_t* const dloss_optionals, ccv_nnc_tensor_variable_t* const parameters, const int parameter_size, ccv_nnc_tensor_variable_t* const saved_aux, const int parallel, ccv_nnc_stream_context_t* const stream_context)
   11|  1.00k|{
   12|  1.00k|  assert(parameter_size > 0);
   13|  1.00k|  assert(loss_size > 0);
   14|  1.00k|  int d, i, j, k;
   15|  1.00k|  int losses_source_size = 0;
   16|       |  // Both f_variable and tensor_variable should be, at least, executed. Otherwise we cannot differentiate.
   17|  2.00k|  for (i = 0; i < loss_size; i++)
   18|  1.00k|  {
   19|  1.00k|    assert(losses[i]->symbol.d >= 0);
   20|  1.00k|    const ccv_nnc_tensor_variable_graph_bind_t* const loss_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, losses[i]->symbol.d);
   21|  1.00k|    assert(loss_symbol_extra->sources && loss_symbol_extra->sources->rnum > 0);
   22|  1.00k|    losses_source_size += loss_symbol_extra->sources->rnum;
   23|  1.00k|  }
   24|  2.00k|  for (i = 0; i < parameter_size; i++)
   25|  1.00k|  {
   26|  1.00k|    assert(parameters[i]->symbol.d >= 0);
   27|  1.00k|    assert(((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, parameters[i]->symbol.d))->destinations &&
   28|  1.00k|      ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, parameters[i]->symbol.d))->destinations->rnum > 0);
   29|  1.00k|  }
   30|  1.00k|  const int exec_symbol_info_size = ccv_nnc_graph_exec_symbol_count(dynamic_graph->tape);
   31|  1.00k|  ccv_array_t* const sources = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), 1, 0);
   32|  1.00k|  if (!dynamic_graph->ws)
   33|      3|    dynamic_graph->ws = ccv_array_new(sizeof(int), exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5), 0);
   34|  1.00k|  ccv_array_t* const ws = dynamic_graph->ws;
   35|  1.00k|  ccv_array_resize(ws, exec_symbol_info_size * 2 + ((exec_symbol_info_size + 31) >> 5));
   36|       |  // set visited to all 0.
   37|  1.00k|  memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
   38|  2.00k|  for (i = 0; i < parameter_size; i++)
   39|  1.00k|  {
   40|  1.00k|    ccv_array_t* const destinations = ((ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, parameters[i]->symbol.d))->destinations;
   41|  2.00k|    for (j = 0; j < destinations->rnum; j++)
   42|  1.00k|      ccv_nnc_insert_if_prior_to_any(dynamic_graph->tape,
   43|  1.00k|        *(int*)ccv_array_get(destinations, j),
   44|  1.00k|        sources, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
   45|  1.00k|        (int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
   46|  1.00k|  }
   47|  1.00k|  ccv_array_t* const destinations = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), losses_source_size, 0);
   48|  2.00k|  for (i = 0; i < loss_size; i++)
   49|  1.00k|  {
   50|  1.00k|    const ccv_nnc_tensor_variable_graph_bind_t* const loss_symbol_extra = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, losses[i]->symbol.d);
   51|  2.00k|    for (j = 0; j < loss_symbol_extra->sources->rnum; j++)
   52|  1.00k|    {
   53|  1.00k|      const int symbol_d = *(int*)ccv_array_get(loss_symbol_extra->sources, j);
   54|  1.00k|      int flag = 0;
   55|  1.00k|      for (k = 0; !flag && k < destinations->rnum; k++)
   56|      2|        flag = (symbol_d == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, k))->d);
   57|  1.00k|      if (!flag)
   58|  1.00k|      {
   59|  1.00k|        const ccv_nnc_graph_exec_symbol_t symbol = {
   60|  1.00k|          .d = symbol_d,
   61|  1.00k|          .graph = dynamic_graph->tape
   62|  1.00k|        };
   63|  1.00k|        ccv_array_push(destinations, &symbol);
   64|  1.00k|      }
   65|  1.00k|    }
   66|  1.00k|  }
   67|       |  // Go over sources, because destinations will get removed all the time, thus, the index is not accurate.
   68|  1.00k|  if (destinations->rnum > 1)
   69|      6|    for (i = 0; i < destinations->rnum; i++)
   70|      4|    {
   71|      4|      memset((uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2), 0, sizeof(uint32_t) * ((exec_symbol_info_size + 31) >> 5));
   72|      4|      ccv_nnc_remove_if_prior_to_any(dynamic_graph->tape,
   73|      4|        ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, i))->d,
   74|      4|        destinations, (uint32_t*)ccv_array_get(ws, exec_symbol_info_size * 2),
   75|      4|        (int*)ccv_array_get(ws, 0), (int*)ccv_array_get(ws, exec_symbol_info_size));
   76|      4|    }
   77|  1.00k|  ccv_nnc_tensor_symbol_t loss_symbols[loss_size];
   78|  2.00k|  for (i = 0; i < loss_size; i++)
   79|  1.00k|    loss_symbols[i] = losses[i]->symbol;
   80|  1.00k|  ccv_nnc_tensor_symbol_t parameter_symbols[parameter_size];
   81|  2.00k|  for (i = 0; i < parameter_size; i++)
   82|  1.00k|    parameter_symbols[i] = parameters[i]->symbol;
   83|  1.00k|  ccv_array_t* const symbol_stack = ccv_array_new(sizeof(ccv_nnc_tape_symbol_t), 1, 0);
   84|  1.00k|  ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol, symbol_stack, 0);
   85|  1.00k|  ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_tensor_symbol_alias, symbol_stack, 0);
   86|  1.00k|  ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, ccv_nnc_dynamic_graph_push_backward_graph_exec_symbol, symbol_stack, 0);
   87|  1.00k|  ccv_nnc_tensor_symbol_t updated_parameter_symbols[parameter_size];
   88|  1.00k|  const int saved_aux_size = parameter_size * ccv_nnc_minimizer_saved_aux_size(minimizer);
   89|  1.00k|  ccv_nnc_tensor_symbol_map_t saved_aux_symbols[saved_aux_size];
   90|  1.00k|  ccv_nnc_graph_exec_symbol_t update_exec_symbols[parameter_size];
   91|  1.00k|  ccv_nnc_symbolic_graph_minimize(dynamic_graph->tape, minimizer,
   92|  1.00k|    loss_symbols, loss_size, parameter_symbols, parameter_size, 0, 0,
   93|  1.00k|    (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
   94|  1.00k|    (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, 0), destinations->rnum,
   95|  1.00k|    0, updated_parameter_symbols, saved_aux_symbols, update_exec_symbols);
   96|  1.00k|  const int parallel_count = ccv_max(parallel, 1);
   97|  1.00k|  if (parallel_count > 1)
   98|      2|  {
   99|      2|    const int per_parameter_size = parameter_size / parallel_count;
  100|      2|    assert((parameter_size % parallel_count) == 0);
  101|      2|    ccv_nnc_tensor_symbol_t* const allreduce_inputs = parallel_count > 1 ? (ccv_nnc_tensor_symbol_t*)alloca(sizeof(ccv_nnc_tensor_symbol_t) * parallel_count * 2) : 0;
  102|      2|    ccv_nnc_tensor_symbol_t* const allreduce_outputs = allreduce_inputs ? allreduce_inputs + parallel_count : 0;
  103|      4|    for (i = 0; i < per_parameter_size; i++)
  104|      2|    {
  105|      6|      for (j = 0; j < parallel_count; j++)
  106|      4|      {
  107|      4|        const int idx = i + j * per_parameter_size;
  108|      4|        assert(parameters[idx]->symbol.d >= 0);
  109|      4|        const ccv_nnc_tensor_param_t info = parameters[i + j * per_parameter_size]->info;
  110|      4|        const ccv_nnc_tensor_symbol_t gradient = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, parameters[idx]->symbol);
  111|      4|        allreduce_inputs[j] = gradient;
  112|      4|        allreduce_outputs[j] = ccv_nnc_tensor_symbol_new(dynamic_graph->tape, info, 0);
  113|      4|      }
  114|      2|      const ccv_nnc_graph_exec_symbol_t allreduce = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_COMM_ALLREDUCE_FORWARD(), allreduce_inputs, parallel_count, allreduce_outputs, parallel_count, 0);
  115|      6|      for (j = 0; j < parallel_count; j++)
  116|      4|      {
  117|      4|        const int idx = i + j * per_parameter_size;
  118|      4|        const ccv_nnc_tensor_symbol_t gradient = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, parameters[idx]->symbol);
  119|      4|        const ccv_nnc_graph_exec_symbol_t graph_exec = ccv_nnc_graph_exec_symbol_for_backward(dynamic_graph->tape, gradient);
  120|      4|        ccv_nnc_graph_exec_symbol_disjoin(dynamic_graph->tape, graph_exec, update_exec_symbols[idx]);
  121|      4|        ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, graph_exec, allreduce);
  122|      4|        ccv_nnc_graph_exec_symbol_replace_io(dynamic_graph->tape, update_exec_symbols[idx], allreduce_inputs[j], allreduce_outputs[j]);
  123|      4|        ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, allreduce, update_exec_symbols[idx]);
  124|      4|      }
  125|      2|    }
  126|      2|  }
  127|  1.00k|  ccv_nnc_tensor_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
  128|  1.00k|  ccv_nnc_tensor_symbol_alias_new_hook(dynamic_graph->tape, 0, 0, 0);
  129|  1.00k|  ccv_nnc_graph_exec_symbol_new_hook(dynamic_graph->tape, 0, 0, 0);
  130|       |  // Bind generated tensors.
  131|  1.00k|  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), dynamic_graph->vars->rnum + 2, 0);
  132|  8.01k|  for (i = 0; i < dynamic_graph->vars->rnum; i++)
  133|  7.00k|  {
  134|  7.00k|    ccv_nnc_tensor_variable_t var = *(ccv_nnc_tensor_variable_t*)ccv_array_get(dynamic_graph->vars, i);
  135|  7.00k|    if (var && var->tensor_view && var->symbol.d >= 0)
  136|  6.00k|    {
  137|  6.00k|      ccv_nnc_tensor_bind_t bind = {
  138|  6.00k|        .symbol = var->symbol,
  139|  6.00k|        .tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(var->tensor_view)
  140|  6.00k|      };
  141|  6.00k|      ccv_array_push(tensor_binds, &bind);
  142|  6.00k|    }
  143|  7.00k|  }
  144|  7.01k|  for (i = 0; i < dynamic_graph->binds->rnum; i++)
  145|  6.00k|  {
  146|  6.00k|    ccv_nnc_tensor_variable_graph_bind_t* const bind = (ccv_nnc_tensor_variable_graph_bind_t*)ccv_array_get(dynamic_graph->binds, i);
  147|  6.00k|    if (bind->index == CCV_NNC_TENSOR_NO_VARIABLE_BUT_USED && bind->tensor_view)
  148|      0|    {
  149|      0|      ccv_nnc_tensor_bind_t b = {
  150|      0|        .symbol = {
  151|      0|          .d = i,
  152|      0|          .graph = dynamic_graph->tape,
  153|      0|        },
  154|      0|        .tensor = (ccv_nnc_tensor_t*)CCV_NNC_TENSOR_VIEW(bind->tensor_view)
  155|      0|      };
  156|      0|      ccv_array_push(tensor_binds, &b);
  157|      0|    }
  158|  6.00k|  }
  159|       |  // Compiled graph comes from the dloss.
  160|  1.00k|  ccv_array_clear(sources);
  161|  1.00k|  ccv_nnc_tensor_symbol_t dloss_symbols[loss_size];
  162|  2.00k|  for (i = 0; i < loss_size; i++)
  163|  1.00k|  {
  164|  1.00k|    dloss_symbols[i] = ccv_nnc_tensor_symbol_for_backward(dynamic_graph->tape, losses[i]->symbol);
  165|  1.00k|    assert(dloss_symbols[i].d >= 0);
  166|  1.00k|  }
  167|  2.00k|  for (d = 0; d < destinations->rnum; d++)
  168|  1.00k|  {
  169|  1.00k|    const ccv_nnc_graph_exec_symbol_t* const destination = (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(destinations, d);
  170|  1.00k|    const int* outgoings; int outgoing_size;
  171|  1.00k|    ccv_nnc_graph_exec_symbol_to(dynamic_graph->tape, *destination, &outgoings, &outgoing_size);
  172|  2.00k|    for (i = 0; i < outgoing_size; i++)
  173|  1.00k|    {
  174|  1.00k|      const int exec_idx = outgoings[i];
  175|  1.00k|      const int* inputs; int input_size;
  176|  1.00k|      ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
  177|  1.00k|        .d = exec_idx,
  178|  1.00k|        .graph = dynamic_graph->tape
  179|  1.00k|      }, &inputs, &input_size, 0, 0);
  180|  1.00k|      for (j = 0; j < input_size; j++)
  181|  1.00k|      {
  182|  1.00k|        const int input = inputs[j];
  183|  1.00k|        const int alias_ref = input >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  184|  1.00k|          .d = input,
  185|  1.00k|          .graph = dynamic_graph->tape
  186|  1.00k|        }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
  187|       |        // alias_ref is either exists, or -1.
  188|  1.00k|        int flag = 0;
  189|  2.01k|        for (k = 0; !flag && k < loss_size; k++)
  190|  1.00k|          flag = (dloss_symbols[k].d == input || dloss_symbols[k].d == alias_ref);
  191|  1.00k|        if (flag)
  192|  1.00k|        {
  193|  1.00k|          flag = 0;
  194|  1.00k|          for (k = 0; !flag && k < sources->rnum; k++)
  195|      2|            flag = (exec_idx == ((ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, k))->d);
  196|  1.00k|          if (!flag)
  197|  1.00k|          {
  198|  1.00k|            const ccv_nnc_graph_exec_symbol_t source = {
  199|  1.00k|              .d = exec_idx,
  200|  1.00k|              .graph = dynamic_graph->tape
  201|  1.00k|            };
  202|  1.00k|            ccv_array_push(sources, &source);
  203|  1.00k|          }
  204|  1.00k|          break;
  205|  1.00k|        }
  206|  1.00k|      }
  207|  1.00k|    }
  208|  1.00k|  }
  209|  1.00k|  ccv_array_free(destinations);
  210|  1.00k|  int freeable_size = 0;
  211|  1.00k|  ccv_nnc_tensor_variable_t freeables[parameter_size + saved_aux_size];
  212|       |  // Bind dt tensor.
  213|  2.00k|  for (i = 0; i < parameter_size; i++)
  214|  1.00k|  {
  215|  1.00k|    const ccv_nnc_tensor_symbol_t symbol = updated_parameter_symbols[i];
  216|  1.00k|    if (parameters[i]->symbol.d >= 0)
  217|  1.00k|      freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, parameters[i]);
  218|  1.00k|    ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_variable(dynamic_graph, parameters[i], stream_context);
  219|  1.00k|    const ccv_nnc_tensor_bind_t dt_bind = {
  220|  1.00k|      .symbol = symbol,
  221|  1.00k|      .tensor = tensor
  222|  1.00k|    };
  223|  1.00k|    ccv_array_push(tensor_binds, &dt_bind);
  224|  1.00k|  }
  225|  2.00k|  for (i = 0; i < saved_aux_size; i++)
  226|  1.00k|  {
  227|  1.00k|    const ccv_nnc_tensor_symbol_map_t symbol_map = saved_aux_symbols[i];
  228|  1.00k|    if (saved_aux[i]->symbol.d >= 0)
  229|      0|      freeables[freeable_size++] = ccv_nnc_tensor_variable_exchange_new(dynamic_graph, saved_aux[i]);
  230|  1.00k|    ccv_nnc_tensor_t* tensor = ccv_nnc_tensor_from_variable(dynamic_graph, saved_aux[i], stream_context);
  231|  1.00k|    ccv_nnc_tensor_bind_t aux_bind = {
  232|  1.00k|      .symbol = symbol_map.source,
  233|  1.00k|      .tensor = tensor
  234|  1.00k|    };
  235|  1.00k|    ccv_array_push(tensor_binds, &aux_bind);
  236|  1.00k|    aux_bind.symbol = symbol_map.destination;
  237|  1.00k|    ccv_array_push(tensor_binds, &aux_bind);
  238|  1.00k|  }
  239|  1.00k|  ccv_nnc_dy_xpu_alloc_t xpu_alloc = {
  240|  1.00k|    .xpu_alloc = &dynamic_graph->xpu_alloc,
  241|  1.00k|    .stream = stream_context
  242|  1.00k|  };
  243|  1.00k|  ccv_nnc_symbolic_graph_compile_param_t compile_params = {
  244|  1.00k|    .allocator = {
  245|  1.00k|      .isa = &ccv_nnc_dy_allocator_isa,
  246|  1.00k|      .context = {
  247|  1.00k|        .alloc = &xpu_alloc,
  248|  1.00k|        .free = &dynamic_graph->xpu_alloc,
  249|  1.00k|      }
  250|  1.00k|    }
  251|  1.00k|  };
  252|  1.00k|  ccv_nnc_graph_t* graph = 0;
  253|  1.00k|  ccv_nnc_tensor_arena_t* tensor_arena = 0;
  254|  1.00k|  ccv_nnc_graph_exec_arena_t* exec_arena = 0;
  255|  1.00k|  if (dloss_optionals)
  256|      0|  {
  257|       |    // If provided df variable, no need to set to all ones.
  258|      0|    for (i = 0; i < loss_size; i++)
  259|      0|    {
  260|      0|      const ccv_nnc_tensor_bind_t df_bind = {
  261|      0|        .symbol = dloss_symbols[i],
  262|      0|        .tensor = ccv_nnc_tensor_from_variable(dynamic_graph, dloss_optionals[i], stream_context)
  263|      0|      };
  264|      0|      ccv_array_push(tensor_binds, &df_bind);
  265|      0|    }
  266|      0|    ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
  267|      0|      (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
  268|      0|      0, 0,
  269|      0|      (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
  270|      0|      update_exec_symbols, parameter_size,
  271|      0|      &graph, &tensor_arena, &exec_arena);
  272|      0|    ccv_array_free(sources);
  273|  1.00k|  } else {
  274|  1.00k|    int max_input_size = 1;
  275|  1.00k|    int max_output_size = 1;
  276|  2.00k|    for (i = 0; i < sources->rnum; i++)
  277|  1.00k|    {
  278|  1.00k|      const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, i);
  279|  1.00k|      int input_size; int output_size;
  280|  1.00k|      ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, 0, &input_size, 0, &output_size);
  281|  1.00k|      max_input_size = ccv_max(input_size, max_input_size);
  282|  1.00k|      max_output_size = ccv_max(output_size, max_output_size);
  283|  1.00k|    }
  284|  1.00k|    const int max_input_bitmask_size = ((max_input_size + 63) >> 6);
  285|  1.00k|    const int max_output_bitmask_size =  ((max_output_size + 63) >> 6);
  286|  1.00k|    ccv_nnc_tensor_symbol_t input_symbols[max_input_size];
  287|  1.00k|    ccv_nnc_tensor_symbol_t output_symbols[max_output_size];
  288|  1.00k|    uint64_t input_bitmasks[max_input_bitmask_size];
  289|  1.00k|    uint64_t output_bitmasks[max_output_bitmask_size];
  290|       |    // Remove these if it is not needed by the cmd, for example, if absence assumed to be 1.
  291|  2.00k|    for (i = 0; i < loss_size; i++)
  292|  1.00k|    {
  293|  1.00k|      if (!dloss_symbols[i].graph) // Skip.
  294|      0|        continue;
  295|  1.00k|      int no_set = 0; // If we cannot find the df_symbols in all sources, we cannot predict whether it is used or not.
  296|  2.01k|      for (j = 0; j < sources->rnum; j++)
  297|  1.00k|      {
  298|  1.00k|        const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
  299|  1.00k|        const int* inputs; int input_size;
  300|  1.00k|        const int* outputs; int output_size;
  301|  1.00k|        ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
  302|  1.00k|        const ccv_nnc_cmd_t cmd = ccv_nnc_graph_exec_symbol_cmd(dynamic_graph->tape, source);
  303|  1.00k|        int flag = 0;
  304|  2.02k|        for (k = 0; !flag && k < input_size; k++)
  305|  1.01k|        {
  306|  1.01k|          const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  307|  1.00k|            .d = inputs[k],
  308|  1.00k|            .graph = dynamic_graph->tape
  309|  1.00k|          }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
  310|  1.01k|          flag = (dloss_symbols[i].d == inputs[k] || dloss_symbols[i].d == alias_ref);
  311|  1.01k|        }
  312|  1.00k|        if (flag)
  313|  1.00k|        {
  314|  1.00k|          no_set = 1;
  315|       |          // Now, check to see if we can remove this symbol from this source.
  316|  1.00k|          memset(input_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((input_size + 63) >> 6)));
  317|  1.00k|          memset(output_bitmasks, 0, sizeof(uint64_t) * ccv_max(1, ((output_size + 63) >> 6)));
  318|  4.01k|          for (k = 0; k < input_size; k++)
  319|  3.01k|            if (inputs[k] >= 0)
  320|  1.00k|            {
  321|  1.00k|              const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  322|  1.00k|                .d = inputs[k],
  323|  1.00k|                .graph = dynamic_graph->tape
  324|  1.00k|              }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
  325|  1.00k|              if (dloss_symbols[i].d != inputs[k] && dloss_symbols[i].d != alias_ref)
  326|      0|                input_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
  327|  1.00k|            }
  328|  2.00k|          for (k = 0; k < output_size; k++)
  329|  1.00k|            if (outputs[k] >= 0)
  330|  1.00k|              output_bitmasks[k >> 6] |= ((uint64_t)1 << (k & 63));
  331|  1.00k|          if (!ccv_nnc_cmd_bitmask(cmd, input_size, output_size, input_bitmasks, (input_size + 63) >> 6, output_bitmasks, (output_size + 63) >> 6))
  332|  1.00k|            no_set = 0;
  333|  1.00k|        }
  334|  1.00k|      }
  335|  1.00k|      if (no_set) // Remove this flag from all sources and continue.
  336|      0|      {
  337|      0|        for (j = 0; j < sources->rnum; j++)
  338|      0|        {
  339|      0|          const ccv_nnc_graph_exec_symbol_t source = *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j);
  340|      0|          const int* inputs; int input_size;
  341|      0|          const int* outputs; int output_size;
  342|      0|          ccv_nnc_graph_exec_symbol_io(dynamic_graph->tape, source, &inputs, &input_size, &outputs, &output_size);
  343|      0|          int flag = 0;
  344|      0|          for (k = 0; !flag && k < input_size; k++)
  345|      0|          {
  346|      0|            const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  347|      0|              .d = inputs[k],
  348|      0|              .graph = dynamic_graph->tape
  349|      0|            }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
  350|      0|            flag = (dloss_symbols[i].d == inputs[k] || dloss_symbols[i].d == alias_ref);
  351|      0|          }
  352|      0|          if (flag)
  353|      0|          {
  354|      0|            for (k = 0; k < input_size; k++)
  355|      0|              if (inputs[k] >= 0)
  356|      0|              {
  357|      0|                const int alias_ref = inputs[k] >= 0 ? ccv_nnc_tensor_symbol_alias_to(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  358|      0|                  .d = inputs[k],
  359|      0|                  .graph = dynamic_graph->tape
  360|      0|                }).d : CCV_NNC_NO_TENSOR_SYMBOL; // This could be CCV_NNC_NO_TENSOR_SYMBOL, which is negative.
  361|      0|                const int no_symbol = (dloss_symbols[i].d == inputs[k] || dloss_symbols[i].d == alias_ref);
  362|      0|                input_symbols[k] = (ccv_nnc_tensor_symbol_t){
  363|      0|                  .d = no_symbol ? CCV_NNC_NO_TENSOR_SYMBOL : inputs[k],
  364|      0|                  .graph = no_symbol ? 0 : dynamic_graph->tape,
  365|      0|                };
  366|      0|              } else {
  367|      0|                input_symbols[k] = (ccv_nnc_tensor_symbol_t){
  368|      0|                  .d = inputs[k],
  369|      0|                  .graph = inputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
  370|      0|                };
  371|      0|              }
  372|      0|            for (k = 0; k < output_size; k++)
  373|      0|              output_symbols[k] = (ccv_nnc_tensor_symbol_t){
  374|      0|                .d = outputs[k],
  375|      0|                .graph = outputs[k] != CCV_NNC_NO_TENSOR_SYMBOL ? dynamic_graph->tape : 0,
  376|      0|              };
  377|      0|            ccv_nnc_graph_exec_symbol_set_io(dynamic_graph->tape, source, input_symbols, input_size, output_symbols, output_size);
  378|      0|          }
  379|      0|        }
  380|      0|        dloss_symbols[i].graph = 0;
  381|      0|      }
  382|  1.00k|    }
  383|       |    // Aggregate them into one set command.
  384|  1.00k|    ccv_nnc_tensor_symbol_t dloss_symbols_0[loss_size];
  385|  1.00k|    ccv_nnc_graph_exec_symbol_t set_ones[loss_size];
  386|  1.00k|    int set_one_size = 0;
  387|  2.00k|    for (i = 0; i < loss_size;)
  388|  1.00k|      if (!dloss_symbols[i].graph) // Skip.
  389|      0|        ++i;
  390|  1.00k|      else {
  391|  1.00k|        dloss_symbols_0[0] = dloss_symbols[i];
  392|  1.00k|        k = 1;
  393|  1.00k|        int idx = loss_size;
  394|  1.00k|        const ccv_nnc_tensor_param_t params_0 = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, dloss_symbols_0[0]);
  395|  1.00k|        for (j = i + 1; j < loss_size; j++)
  396|      2|          if (dloss_symbols[j].graph)
  397|      2|          {
  398|      2|            const ccv_nnc_tensor_param_t params_j = ccv_nnc_tensor_symbol_params(dynamic_graph->tape, dloss_symbols[j]);
  399|      2|            if (params_j.type != params_0.type)
  400|      2|            {
  401|      2|              if (idx == loss_size)
  402|      2|                idx = j;
  403|      2|            } else {
  404|      0|              dloss_symbols_0[k++] = dloss_symbols[j];
  405|      0|              assert(dloss_symbols[j].graph == dynamic_graph->tape);
  406|      0|              dloss_symbols[j].graph = 0;
  407|      0|            }
  408|      2|          }
  409|  1.00k|        i = idx;
  410|  1.00k|        set_ones[set_one_size] = ccv_nnc_graph_exec_symbol_new(dynamic_graph->tape, CMD_SET_FORWARD(1), 0, 0, dloss_symbols_0, k, 0);
  411|  2.01k|        for (j = 0; j < sources->rnum; j++)
  412|  1.00k|          ccv_nnc_graph_exec_symbol_concat(dynamic_graph->tape, set_ones[set_one_size], *(ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, j));
  413|  1.00k|        ++set_one_size;
  414|  1.00k|      }
  415|       |    // Reset it back.
  416|  2.00k|    for (i = 0; i < loss_size; i++)
  417|  1.00k|      dloss_symbols[i].graph = dynamic_graph->tape;
  418|  1.00k|    if (set_one_size > 0)
  419|  1.00k|    {
  420|  1.00k|      ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
  421|  1.00k|        (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
  422|  1.00k|        0, 0,
  423|  1.00k|        set_ones, set_one_size,
  424|  1.00k|        update_exec_symbols, parameter_size,
  425|  1.00k|        &graph, &tensor_arena, &exec_arena);
  426|  1.00k|    } else {
  427|      0|      ccv_nnc_symbolic_graph_compile(dynamic_graph->tape, compile_params,
  428|      0|        (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum,
  429|      0|        0, 0,
  430|      0|        (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(sources, 0), sources->rnum,
  431|      0|        update_exec_symbols, parameter_size,
  432|      0|        &graph, &tensor_arena, &exec_arena);
  433|      0|    }
  434|  1.00k|    ccv_array_free(sources);
  435|  2.00k|    for (i = 0; i < set_one_size; i++)
  436|  1.00k|      ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, set_ones[i]);
  437|  1.00k|  }
  438|  1.00k|  ccv_array_free(tensor_binds);
  439|       |  // Remove newly added symbols to restore the graph.
  440|  15.0k|  for (i = 0; i < symbol_stack->rnum; i++)
  441|  14.0k|  {
  442|  14.0k|    const ccv_nnc_tape_symbol_t* const symbol = (ccv_nnc_tape_symbol_t*)ccv_array_get(symbol_stack, i);
  443|  14.0k|    if (symbol->type == CCV_NNC_SYMBOL_TENSOR || symbol->type == CCV_NNC_SYMBOL_TENSOR_ALIAS)
  444|  9.01k|      ccv_nnc_tensor_symbol_free(dynamic_graph->tape, (ccv_nnc_tensor_symbol_t){
  445|  9.01k|        .d = symbol->d,
  446|  9.01k|        .graph = dynamic_graph->tape
  447|  9.01k|      });
  448|  5.01k|    else if (symbol->type == CCV_NNC_SYMBOL_GRAPH_EXEC)
  449|  5.01k|      ccv_nnc_graph_exec_symbol_free(dynamic_graph->tape, (ccv_nnc_graph_exec_symbol_t){
  450|  5.01k|        .d = symbol->d,
  451|  5.01k|        .graph = dynamic_graph->tape
  452|  5.01k|      });
  453|  14.0k|  }
  454|  1.00k|  ccv_array_free(symbol_stack);
  455|  1.00k|  if (stream_context)
  456|      1|  {
  457|      1|    ccv_nnc_graph_set_default_static_schedule(graph, ccv_nnc_stream_context_type(stream_context), dynamic_graph->max_stream_count);
  458|      1|    ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, stream_context);
  459|      1|    ccv_nnc_tensor_arena_buffer_free(tensor_arena);
  460|      1|    ccv_nnc_compilation_artifact_t* const artifact = ccv_nnc_compilation_artifact_new(graph, tensor_arena, exec_arena);
  461|      1|    ccv_nnc_stream_context_add_callback(stream_context, (ccv_nnc_callback_f)ccv_nnc_compilation_artifact_free, artifact);
  462|  1.00k|  } else {
  463|  1.00k|    if (parallel > 1)
  464|      1|    {
  465|      1|      int flag = 0;
  466|      2|      for (i = 0; !flag && i < parameter_size; i++)
  467|      1|        flag = (CCV_TENSOR_GET_MEMORY(parameters[i]->info.type) == CCV_TENSOR_GPU_MEMORY);
  468|      1|      const int stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
  469|      1|      ccv_nnc_graph_set_default_static_schedule(graph, stream_type, dynamic_graph->max_stream_count);
  470|      1|      ccv_nnc_stream_context_t* const default_stream = ccv_nnc_graph_default_stream(graph);
  471|      1|      ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, default_stream);
  472|      1|      ccv_nnc_stream_context_wait(default_stream);
  473|      1|    } else
  474|  1.00k|      ccv_nnc_graph_run(graph, 0, TRAVERSE_FULL, 0, 0);
  475|  1.00k|    ccv_nnc_graph_free(graph);
  476|  1.00k|    ccv_nnc_tensor_arena_free(tensor_arena);
  477|  1.00k|    ccv_nnc_graph_exec_arena_free(exec_arena);
  478|  1.00k|  }
  479|       |  // Now, able to free some of the reused outputs. This need to be the last step otherwise some of the exec symbols
  480|       |  // above may be freed by this operation.
  481|  2.00k|  for (i = 0; i < freeable_size; i++)
  482|  1.00k|    ccv_nnc_tensor_variable_free(dynamic_graph, freeables[i]);
  483|  1.00k|}
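
Note: for orientation, a minimal, hypothetical calling sketch of the entry point follows; it is not part of the covered file. The only signatures taken from the listing are ccv_nnc_dynamic_graph_minimize (source line 10) and the saved_aux sizing via ccv_nnc_minimizer_saved_aux_size (source line 88); the wrapper name minimize_once and the conventions stated in the comments are assumptions to verify against ccv_nnc.h.

#include "ccv_nnc.h"

/* Hypothetical wrapper: assumes `loss` was already computed on `graph`'s tape and that
 * every entry of `parameters` participated in that computation (the entry point asserts
 * both, source lines 19-21 and 26-28). `saved_aux` must supply
 * parameter_size * ccv_nnc_minimizer_saved_aux_size(minimizer) tensor variables,
 * matching the saved_aux_size computed at source line 88. */
static void minimize_once(ccv_nnc_dynamic_graph_t* const graph, const ccv_nnc_cmd_t minimizer,
  ccv_nnc_tensor_variable_t loss,
  ccv_nnc_tensor_variable_t* const parameters, const int parameter_size,
  ccv_nnc_tensor_variable_t* const saved_aux)
{
  ccv_nnc_dynamic_graph_minimize(graph, minimizer,
    &loss, 1,  /* losses, loss_size */
    0,         /* dloss_optionals == 0: d(loss) is seeded with ones via CMD_SET_FORWARD(1), source lines 383-414 */
    parameters, parameter_size,
    saved_aux,
    0,         /* parallel: clamped to 1 at source line 96, i.e. no data-parallel replicas */
    0);        /* stream_context == 0: run the compiled graph synchronously, source lines 462-478 */
}

Passing a non-zero stream_context instead takes the asynchronous path at source lines 455-461, which this report shows exercised only once.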