Coverage Report

Created: 2021-04-06 02:31

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_cnnp_model.c
Line | Count | Source
   1 |       | #include "ccv_nnc.h"
   2 |       | #include "ccv_nnc_easy.h"
   3 |       | #include "ccv_nnc_internal.h"
   4 |       | #include "ccv_internal.h"
   5 |       | #include "_ccv_cnnp_model.h"
   6 |       |
   7 |       | // MARK - Level-5 API
   8 |       |
   9 |       | ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
  10 |   519 | {
  11 |   519 |   assert(input_size > 0);
  12 |   519 |   if (!model->io)
  13 |   511 |     model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  14 |   519 |   ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
  15 |   519 |   model_io->param_ref = 0;
  16 |   519 |   model_io->param_sel = 0;
  17 |   519 |   model_io->visit = 0;
  18 |   519 |   model_io->model = model;
  19 |   519 |   model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  20 |   519 |   model_io->outgoings = 0;
  21 |   519 |   model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
  22 |   519 |   ccv_array_push(model->io, &model_io);
  23 |   519 |   int i;
  24 |   519 |   ccv_array_resize(model_io->incomings, input_size);
  25 |   519 |   memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
  26 | 1.15k |   for (i = 0; i < input_size; i++)
  27 |   637 |   {
  28 |   637 |     if (!inputs[i]->outgoings)
  29 |   559 |       inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  30 |   637 |     ccv_array_push(inputs[i]->outgoings, &model_io);
  31 |   637 |   }
  32 |   519 |   return model_io;
  33 |   519 | }
  34 |       |
  35 |       | int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
  36 |     0 | {
  37 |     0 |   return model->output_size;
  38 |     0 | }
  39 |       |
  40 |       | ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
  41 |   354 | {
  42 |   354 |   if (!model->io)
  43 |    29 |     model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
  44 |   354 |   ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
  45 |   354 |   model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;
  46 |   354 |   model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;
  47 |   354 |   model_io->visit = 0;
  48 |   354 |   model_io->model = model;
  49 |   354 |   model_io->outputs = 0;
  50 |   354 |   model_io->incomings = 0;
  51 |   354 |   model_io->outgoings = 0;
  52 |   354 |   ccv_array_push(model->io, &model_io);
  53 |   354 |   return model_io;
  54 |   354 | }
  55 |       |
  56 |       | void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
  57 |     3 | {
  58 |     3 |   model->notify_hook.func = func;
  59 |     3 |   model->notify_hook.context = context;
  60 |     3 | }
  61 |       |
  62 |       | void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
  63 |    14 | {
  64 |    14 |   if (model->notify_hook.func)
  65 |     3 |     model->notify_hook.func(model, tag, payload, model->notify_hook.context);
  66 |    14 |   if (model->isa->notify)
  67 |     1 |     model->isa->notify(model, tag, payload);
  68 |    14 | }
  69 |       |
  70 |       | static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
  71 | 2.23k | {
  72 | 2.23k |   int i, j;
  73 | 4.88k |   for (i = 0; i < graph_exec_symbol_size; i++)
  74 | 2.65k |   {
  75 | 2.65k |     ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
  76 | 2.65k |     // Check whether this tensor symbol has any duplicate.
  77 | 27.4k |     for (j = i + 1; j < graph_exec_symbol_size;)
  78 | 24.7k |     {
  79 | 24.7k |       ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
  80 | 24.7k |       // If there is a same tensor symbol, remove it.
  81 | 24.7k |       if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)
  82 | 2.74k |       {
  83 | 2.74k |         if (j + 1 < graph_exec_symbol_size)
  84 |   473 |           *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
  85 | 2.74k |         --graph_exec_symbol_size;
  86 | 2.74k |         continue;
  87 | 2.74k |       }
  88 | 22.0k |       ++j;
  89 | 22.0k |     }
  90 | 2.65k |   }
  91 | 2.23k |   return graph_exec_symbol_size;
  92 | 2.23k | }
  93 |       |
  94 |       | typedef struct {
  95 |       |   ccv_cnnp_model_sequence_t* sequence;
  96 |       |   char prefix;
  97 |       |   ccv_array_t* symbols;
  98 |       |   ccv_array_t* ids;
  99 |       | } ccv_cnnp_model_add_to_array_context_t;
 100 |       |
 101 |       | static void _ccv_cnnp_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol)
 102 | 3.10k | {
 103 | 3.10k |   ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
 104 | 3.10k |   ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
 105 | 3.10k |   int i;
 106 | 3.10k |   if (!model->parameter_indices)
 107 | 2.50k |     model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
 108 | 37.3k |   for (i = 0; i < add_to_array_context->symbols->rnum; i++)
 109 | 34.2k |   {
 110 | 34.2k |     const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
 111 | 34.2k |     if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)
 112 |    22 |     {
 113 |    22 |       // Only add to parameter_indices if it is trainable.
 114 |    22 |       if (add_to_array_context->prefix == 't')
 115 |    14 |         ccv_array_add_unique_int(model->parameter_indices, i);
 116 |    22 |       // Found it, return, don't add it.
 117 |    22 |       return;
 118 |    22 |     }
 119 | 34.2k |   }
 120 | 3.10k |   // Only add to parameter_indices if it is trainable.
 121 | 3.10k |   if (add_to_array_context->prefix == 't')
 122 | 2.92k |     ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
 123 | 3.08k |   // This is a new one, no need to add_unique_int, it is unique.
 124 | 3.08k |   ccv_array_push(add_to_array_context->symbols, &symbol);
 125 | 3.08k |   char id[2048];
 126 | 3.08k |   id[0] = add_to_array_context->prefix;
 127 | 3.08k |   id[1] = '-';
 128 | 3.08k |   int total_len = 2;
 129 | 6.17k |   for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)
 130 | 3.08k |   {
 131 | 3.08k |     const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
 132 | 3.08k |     int len;
 133 | 3.08k |     if (name->name && name->name[0] != '\0')
 134 |    70 |       len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
 135 | 3.01k |     else
 136 | 3.01k |       len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
 137 | 3.08k |     total_len += len;
 138 | 3.08k |     if (total_len >= 2047)
 139 |     0 |       break;
 140 | 3.08k |   }
 141 | 3.08k |   if (total_len < 2047)
 142 | 3.08k |     total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
 143 | 3.08k |   assert(total_len < 2048);
 144 | 3.08k |   char *heap_id = (char*)ccmalloc(total_len + 1);
 145 | 3.08k |   memcpy(heap_id, id, total_len + 1);
 146 | 3.08k |   ccv_array_push(add_to_array_context->ids, &heap_id);
 147 | 3.08k |   ++add_to_array_context->sequence->it;
 148 | 3.08k | }
 149 |       |
 150 |       | static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
 151 | 2.25k | {
 152 | 2.25k |   assert(model->graph);
 153 | 2.25k |   model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
 154 | 2.25k |   int i;
 155 | 4.55k |   for (i = 0; i < input_size; i++)
 156 | 2.29k |     model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
 157 | 2.25k |   ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
 158 | 2.25k |   ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
 159 | 2.25k |   ccv_cnnp_model_sequence_t model_sequence = {
 160 | 2.25k |     .bank = kh_init(ccv_cnnp_model_name_bank)
 161 | 2.25k |   };
 162 | 2.25k |   ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
 163 | 2.25k |     .sequence = &model_sequence,
 164 | 2.25k |     .prefix = 't',
 165 | 2.25k |     .symbols = parameters,
 166 | 2.25k |     .ids = parameter_ids,
 167 | 2.25k |   };
 168 | 2.25k |   ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
 169 | 2.25k |   ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
 170 | 2.25k |   ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
 171 | 2.25k |     .sequence = &model_sequence,
 172 | 2.25k |     .prefix = 'r',
 173 | 2.25k |     .symbols = internals,
 174 | 2.25k |     .ids = internal_ids,
 175 | 2.25k |   };
 176 | 2.25k |   ccv_cnnp_model_build_data_t build_data = {
 177 | 2.25k |     .model_sequence = &model_sequence,
 178 | 2.25k |     .add_to_array = _ccv_cnnp_add_to_array,
 179 | 2.25k |     .parameters = parameters,
 180 | 2.25k |     .context = {
 181 | 2.25k |       .add_to_parameter = &add_to_parameter_context,
 182 | 2.25k |       .add_to_output = &add_to_output_context,
 183 | 2.25k |     },
 184 | 2.25k |   };
 185 | 2.25k |   model->data = &build_data;
 186 | 2.25k |   ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
 187 | 2.25k |   model->data = 0;
 188 | 2.25k |   kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
 189 | 2.25k |   ccv_array_free(model_sequence.sequences);
 190 | 2.25k |   // Assert no parameter is alias.
 191 | 5.18k |   for (i = 0; i < parameters->rnum; i++)
 192 | 2.92k |   {
 193 | 2.92k |     const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
 194 | 2.92k |     const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
 195 | 2.92k |     assert(alias_to.graph == 0); // Cannot find the one alias to.
 196 | 2.92k |   }
 197 | 2.25k |   // Assert no internal is alias.
 198 | 2.41k |   for (i = 0; i < internals->rnum; i++)
 199 |   160 |   {
 200 |   160 |     const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
 201 |   160 |     const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(retained.graph, retained);
 202 |   160 |     assert(alias_to.graph == 0); // Cannot find the one alias to.
 203 |   160 |   }
 204 | 2.25k |   const int output_size = model->output_size;
 205 | 2.25k |   ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 206 | 2.25k |   ccv_nnc_symbolic_graph_simplify(model->graph,
 207 | 2.25k |     SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
 208 | 2.25k |       CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
 209 | 2.25k |       CCV_NNC_SIMPLIFY_OPS_FUSION,
 210 | 2.25k |       CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
 211 | 2.25k |     model->inputs, input_size,
 212 | 2.25k |     model->outputs, output_size,
 213 | 2.25k |     SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
 214 | 2.25k |   ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
 215 | 2.25k |   compiled_data->f = compiled_data->fits + output_size;
 216 | 2.25k |   const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
 217 | 2.25k |   assert(evaluate_to_size > 0);
 218 | 2.25k |   compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
 219 | 2.25k |   memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
 220 | 2.25k |   compiled_data->loss = loss;
 221 | 2.25k |   if (loss.cmd == CCV_NNC_NOOP)
 222 | 2.25k |   {
 223 | 2.25k |     // If no loss function provided, there is no fits.
 224 | 4.50k |     for (i = 0; i < output_size; i++)
 225 | 2.25k |     {
 226 | 2.25k |       compiled_data->fits[i] = NO_TENSOR_SYMBOL;
 227 | 2.25k |       const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
 228 | 2.25k |       if (alias_to.d < 0)
 229 | 1.25k |         compiled_data->f[i] = model->outputs[i];
 230 | 1.00k |       else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
 231 | 1.00k |         int ofs[CCV_NNC_MAX_DIM_ALLOC];
 232 | 1.00k |         int inc[CCV_NNC_MAX_DIM_ALLOC];
 233 | 1.00k |         ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
 234 | 1.00k |         int j;
 235 | 13.0k |         for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
 236 | 12.0k |           { assert(ofs[j] == 0); } // There is no ofs.
 237 | 1.00k |         compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
 238 | 1.00k |       }
 239 | 2.25k |     }
 240 | 2.25k |   } else {
 241 |    14 |     for (i = 0; i < output_size; i++)
 242 |     7 |     {
 243 |     7 |       const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
 244 |     7 |       const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
 245 |     7 |       compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
 246 |     7 |       ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
 247 |     7 |     }
 248 |     7 |   }
 249 | 2.25k |   ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 250 | 2.25k |   ccv_nnc_symbolic_graph_simplify(model->graph,
 251 | 2.25k |     SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
 252 | 2.25k |     0, 0, // No need to provide binds at this point.
 253 | 2.25k |     compiled_data->f, model->output_size,
 254 | 2.25k |     SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
 255 | 2.25k |   // If inputs are from GPU, stream type is GPU.
 256 | 2.25k |   compiled_data->parameters = parameters;
 257 | 2.25k |   compiled_data->internals = internals;
 258 | 2.25k |   compiled_data->ids.parameters = parameter_ids;
 259 | 2.25k |   compiled_data->ids.internals = internal_ids;
 260 | 2.25k | }
 261 |       |
 262 |       | static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
 263 | 7.39k | {
 264 | 7.39k |   ccv_array_t* const stack = (ccv_array_t*)context;
 265 | 7.39k |   ccv_array_push(stack, &symbol.d);
 266 | 7.39k | }
 267 |       |
 268 |       | static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
 269 | 38.0k | {
 270 | 38.0k |   const ccv_nnc_tensor_symbol_t src_symbol = {
 271 | 38.0k |     .d = src_index,
 272 | 38.0k |     .graph = src_graph
 273 | 38.0k |   };
 274 | 38.0k |   const ccv_nnc_tensor_symbol_t dest_symbol = {
 275 | 38.0k |     .d = dest_index,
 276 | 38.0k |     .graph = dest_graph
 277 | 38.0k |   };
 278 | 38.0k |   const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
 279 | 38.0k |   ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
 280 | 38.0k |   int ofs[CCV_NNC_MAX_DIM_ALLOC];
 281 | 38.0k |   int inc[CCV_NNC_MAX_DIM_ALLOC];
 282 | 38.0k |   if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
 283 | 1.34k |     ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
 284 | 38.0k | }
 285 |       |
 286 |       | static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
 287 | 2.44k | {
 288 | 2.44k |   const ccv_nnc_tensor_symbol_t src_symbol = {
 289 | 2.44k |     .d = src_index,
 290 | 2.44k |     .graph = src_graph
 291 | 2.44k |   };
 292 | 2.44k |   const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
 293 | 2.44k |   const ccv_nnc_tensor_symbol_t dest_symbol = {
 294 | 2.44k |     .d = dest_index,
 295 | 2.44k |     .graph = dest_graph
 296 | 2.44k |   };
 297 | 2.44k |   const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
 298 | 2.44k |   return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
 299 | 2.44k | }
 300 |       |
 301 |       | static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
 302 |       | static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
 303 |       |
 304 |       | typedef struct {
 305 |       |   int parallel_count;
 306 |       |   ccv_nnc_symbolic_graph_t* graph;
 307 |       |   ccv_nnc_graph_exec_arena_t* graph_exec_arena;
 308 |       | } ccv_nnc_graph_exec_update_t;
 309 |       |
 310 |       | static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
 311 |    56 | {
 312 |    56 |   ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
 313 |    56 |   ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
 314 |    56 |   ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
 315 |    56 |   ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
 316 |    56 |   ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
 317 |    56 |   const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
 318 |    56 |   const int parallel_count = graph_exec_update->parallel_count;
 319 |    56 |   int i;
 320 |   176 |   for (i = 1; i < parallel_count; i++)
 321 |   120 |   {
 322 |   120 |     const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
 323 |   120 |     if (!CCV_NO_GRAPH_EXEC(copy))
 324 |   120 |     {
 325 |   120 |       ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
 326 |   120 |       ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
 327 |   120 |     }
 328 |   120 |   }
 329 |    56 | }
 330 |       |
 331 |       | void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
 332 | 2.20k | {
 333 | 2.20k |   assert(model->graph);
 334 | 2.20k |   assert(model->compiled_data);
 335 | 2.20k |   assert(!init->graph);
 336 | 2.20k |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 337 | 2.20k |   init->graph = ccv_nnc_symbolic_graph_new();
 338 | 2.20k |   ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
 339 | 2.20k |   ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack);
 340 | 2.20k |   _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
 341 | 2.20k |   init->parallel_count = model->parallel_count;
 342 | 2.20k |   init->memory_compression = model->memory_compression;
 343 | 2.20k |   init->compiled_data->stream_type = model->compiled_data->stream_type;
 344 | 2.20k |   init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
 345 | 2.20k |   init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
 346 | 2.20k |   if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
 347 | 2.20k |     _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
 348 | 2.20k |   ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0);
 349 | 2.20k |   ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
 350 | 2.20k |   int i, j;
 351 | 2.20k |   // Verify parameters, internals and saved_aux in both graph has the same dimensionality.
 352 | 4.64k |   for (i = 0; i < compiled_data->parameters->rnum; i++)
 353 | 2.44k |   {
 354 | 2.44k |     const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
 355 | 2.44k |     assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
 356 | 2.44k |   }
 357 | 2.20k |   for (i = 0; i < compiled_data->internals->rnum; i++)
 358 |     0 |   {
 359 |     0 |     const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
 360 |     0 |     assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
 361 |     0 |   }
 362 | 2.20k |   // Update inputs.
 363 | 2.20k |   assert(model->input_size == init->input_size);
 364 | 4.40k |   for (i = 0; i < model->input_size; i++)
 365 | 2.20k |     if (model->inputs[i].d >= 0)
 366 | 2.20k |     {
 367 | 2.20k |       assert(init->inputs[i].d >= 0);
 368 | 2.20k |       _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
 369 | 2.20k |     }
 370 | 2.20k |   // Update outputs.
 371 | 2.20k |   assert(model->output_size == init->output_size);
 372 | 4.40k |   for (i = 0; i < model->output_size; i++)
 373 | 2.20k |   {
 374 | 2.20k |     if (model->outputs[i].d >= 0)
 375 | 2.20k |     {
 376 | 2.20k |       assert(init->outputs[i].d >= 0);
 377 | 2.20k |       _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
 378 | 2.20k |     }
 379 | 2.20k |     if (model->outputs[i].d != model->compiled_data->f[i].d)
 380 | 1.00k |     {
 381 | 1.00k |       assert(init->outputs[i].d != init->compiled_data->f[i].d);
 382 | 1.00k |       if (model->compiled_data->f[i].d >= 0)
 383 | 1.00k |       {
 384 | 1.00k |         assert(init->compiled_data->f[i].d >= 0);
 385 | 1.00k |         _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
 386 | 1.00k |       }
 387 | 1.00k |     }
 388 | 2.20k |   }
 389 | 2.20k |   // Go through the graph to set tensor on matching symbols
 390 | 9.59k |   for (i = 0; i < stack->rnum; i++)
 391 | 7.39k |   {
 392 | 7.39k |     const int d = *(int*)ccv_array_get(stack, i);
 393 | 7.39k |     // If exceed range, skip.
 394 | 7.39k |     if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
 395 | 7.39k |       d >= ccv_nnc_graph_exec_symbol_count(model->graph))
 396 |     0 |       continue;
 397 | 7.39k |     const ccv_nnc_graph_exec_symbol_t src_symbol = {
 398 | 7.39k |       .d = d,
 399 | 7.39k |       .graph = init->graph
 400 | 7.39k |     };
 401 | 7.39k |     const ccv_nnc_graph_exec_symbol_t dest_symbol = {
 402 | 7.39k |       .d = d,
 403 | 7.39k |       .graph = model->graph
 404 | 7.39k |     };
 405 | 7.39k |     const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
 406 | 7.39k |     const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
 407 | 7.39k |     // If the name doesn't match, skip.
 408 | 7.39k |     if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)
 409 |     0 |       continue;
 410 | 7.39k |     // Now get all the inputs and outputs, if matches, set them.
 411 | 7.39k |     const int* src_inputs;
 412 | 7.39k |     int src_input_size;
 413 | 7.39k |     const int* src_outputs;
 414 | 7.39k |     int src_output_size;
 415 | 7.39k |     ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
 416 | 7.39k |     const int* dest_inputs;
 417 | 7.39k |     int dest_input_size;
 418 | 7.39k |     const int* dest_outputs;
 419 | 7.39k |     int dest_output_size;
 420 | 7.39k |     ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
 421 | 7.39k |     // We may have unmatched input / output size because this is the minimizer and it has
 422 | 7.39k |     // different saved_aux (for example, when we shrunk with CMD_NOOP).
 423 | 7.39k |     if (src_input_size != dest_input_size)
 424 |     0 |       continue;
 425 | 7.39k |     if (src_output_size != dest_output_size)
 426 |     0 |       continue;
 427 | 7.39k |     ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
 428 | 7.39k |     // There may be mismatches of the source tensor symbols and destination tensor symbols. The reason is because
 429 | 7.39k |     // we may later passed-in the minimizer, therefore, we may allocate tensors for minimizer later in the original
 430 | 7.39k |     // graph whereas in the newly created graph, it is streamlined (the minimizer exists from the beginning). That
 431 | 7.39k |     // will make the order of tensor symbols creation different, therefore, exact which tensor is which wrong as
 432 | 7.39k |     // well. However, set a new minimizer won't change the exec symbol ordering, because we never create new exec
 433 | 7.39k |     // symbols after gradient init step. Changing a new minimizer just updated that exec symbols setting, it is not
 434 | 7.39k |     // a new exec symbol.
 435 | 30.1k |     for (j = 0; j < src_input_size; j++)
 436 | 22.7k |       if (src_inputs[j] >= 0)
 437 | 19.8k |         _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
 438 | 20.1k |     for (j = 0; j < src_output_size; j++)
 439 | 12.7k |       if (src_outputs[j] >= 0)
 440 | 12.7k |         _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
 441 | 7.39k |   }
 442 | 2.20k |   ccv_array_free(stack);
 443 | 2.20k |   // After this, we get all tensors in the model graph resolved through tensor_auto.
 444 | 2.20k |   ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
 445 | 2.20k |   // Verify symbols we get matches.
 446 | 2.20k |   const int parameter_size = compiled_data->parameters->rnum;
 447 | 4.64k |   for (i = 0; i < parameter_size; i++)
 448 | 2.44k |     { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
 449 | 2.20k |   const int internal_size = compiled_data->internals->rnum;
 450 | 2.20k |   for (i = 0; i < internal_size; i++)
 451 |     0 |     { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
 452 | 2.20k |   // Go through compiled data.
 453 | 2.20k |   if (compiled_data->tensor_arena)
 454 | 2.20k |   {
 455 | 2.20k |     const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
 456 | 2.20k |     if (flag == 0 && compiled_data->graph_exec_arena)
 457 | 2.20k |     {
 458 | 2.20k |       ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
 459 | 2.20k |       // Since we will reinit, if we previously set is_test, we need to set it again.
 460 | 2.20k |       if (compiled_data->is_test)
 461 |     1 |       {
 462 |     1 |         const int parallel_count = ccv_max(model->parallel_count, 1);
 463 |     1 |         ccv_nnc_graph_exec_update_t update = {
 464 |     1 |           .parallel_count = parallel_count,
 465 |     1 |           .graph = model->graph,
 466 |     1 |           .graph_exec_arena = compiled_data->graph_exec_arena,
 467 |     1 |         };
 468 |     1 |         ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
 469 |     1 |       }
 470 | 2.20k |     } else
 471 |     2 |       // Free-up tensor arena & graph exec arena.
 472 |     2 |       _ccv_cnnp_compiled_data_graph_free(compiled_data);
 473 | 2.20k |   }
 474 | 2.20k |   // There are other compiled graphs, for accum and apply gradients.
 475 | 2.20k |   // However, the main conclusion is, these absorb operations shouldn't impact parameters.
 476 | 2.20k |   // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
 477 | 2.20k |   // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
 478 | 2.20k |   // be changed otherwise parameters' shape will be meaningless. The same goes to internals.
 479 | 2.20k |   // That is why we don't update these compiled graphs at all this point.
 480 | 2.20k |   // Free the model, we've already "absorbed" it.
 481 | 2.20k |   ccv_cnnp_model_free(init);
 482 | 2.20k | }
 483 |       |
 484 |       | void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
 485 | 2.25k | {
 486 | 2.25k |   assert(input_size == model->input_size || model->input_size == 0);
 487 | 2.25k |   if (model->input_size == 0)
 488 |     5 |     model->input_size = input_size;
 489 | 2.25k |   if (!model->graph) // The graph is not compiled yet.
 490 |    50 |   {
 491 |    50 |     model->graph = ccv_nnc_symbolic_graph_new();
 492 |    50 |     _ccv_cnnp_model_compile(model, inputs, input_size, loss);
 493 |    50 |     assert(model->compiled_data);
 494 |    50 |     int i, flag = 0;
 495 |   121 |     for (i = 0; !flag && i < input_size; i++)
 496 |    71 |       flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
 497 |    50 |     // If inputs are from GPU, stream type is GPU.
 498 |    50 |     model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
 499 |    50 |     model->compiled_data->minimize.minimizer = minimizer;
 500 |    50 |     model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
 501 | 2.20k |   } else {
 502 | 2.20k |     // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
 503 | 2.20k |     // And then absorb the "new model" to the old one.
 504 | 2.20k |     ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model);
 505 | 2.20k |     ccv_cnnp_model_absorb(model, init, inputs, input_size);
 506 | 2.20k |     // Reset minimizer.
 507 | 2.20k |     ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
 508 | 2.20k |   }
 509 | 2.25k | }
 510 |       |
 511 |       | ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model)
 512 | 2.20k | {
 513 | 2.20k |   return _ccv_cnnp_model_copy(model, 0);
 514 | 2.20k | }
 515 |       |
 516 |       | void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
 517 | 4.43k | {
 518 | 4.43k |   assert(model->graph);
 519 | 4.43k |   assert(output_size == model->output_size);
 520 | 4.43k |   ccv_nnc_symbolic_graph_t* const graph = model->graph;
 521 | 4.43k |   ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
 522 | 4.43k |   int i;
 523 | 8.87k |   for (i = 0; i < output_size; i++)
 524 | 4.43k |   {
 525 | 4.43k |     assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
 526 | 4.43k |     outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
 527 | 4.43k |   }
 528 | 4.43k | }
 529 |       |
 530 |       | void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
 531 |     3 | {
 532 |     3 |   if (workspace_size == model->workspace_size)
 533 |     0 |     return;
 534 |     3 |   model->workspace_size = workspace_size;
 535 |     3 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 536 |     3 |   if (compiled_data && compiled_data->graph)
 537 |     0 |     ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
 538 |     3 | }
 539 |       |
 540 |       | void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
 541 |    15 | {
 542 |    15 |   if (parallel == 0)
 543 |     0 |     model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
 544 |    15 |   else
 545 |    15 |     model->parallel_count = parallel;
 546 |    15 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 547 |    15 |   if (compiled_data)
 548 |    11 |     { assert(!compiled_data->graph); }
 549 |    15 | }
 550 |       |
 551 |       | void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
 552 |     0 | {
 553 |     0 |   model->memory_compression = memory_compression;
 554 |     0 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 555 |     0 |   if (compiled_data)
 556 |     0 |     { assert(!compiled_data->graph); }
 557 |     0 | }
 558 |       |
 559 |       | typedef struct {
 560 |       |   int parallel_count;
 561 |       |   ccv_nnc_symbolic_graph_t* graph;
 562 |       |   ccv_cnnp_compiled_data_t* compiled_data;
 563 |       |   ccv_nnc_tensor_arena_t* tensor_arena;
 564 |       | } ccv_nnc_tensor_init_states_t;
 565 |       |
 566 |       | static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
 567 |    55 | {
 568 |    55 |   int i;
 569 |   149 |   for (i = 0; i < compiled_data->parameters->rnum; i++)
 570 |   121 |   {
 571 |   121 |     const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
 572 |   121 |     if (!(compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f))))
 573 |    27 |       return 1;
 574 |   121 |   }
 575 |    55 |   for (i = 0; i < compiled_data->internals->rnum; i++)
 576 |     5 |   {
 577 |     5 |     const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
 578 |     5 |     if (!(compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f))))
 579 |     5 |       return 1;
 580 |     5 |   }
 581 |    28 |   return 0;
 582 |    28 | }
 583 |       |
 584 |       | static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
 585 |   295 | {
 586 |   295 |   ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
 587 |   295 |   ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
 588 |   295 |   ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
 589 |   295 |   if (!output_tensor)
 590 |     0 |     return;
 591 |   295 |   const int d = output_symbol.d;
 592 |   295 |   assert(d < tensor_init_states->compiled_data->tensors_init.size);
 593 |   295 |   if (tensor_init_states->compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f)))
 594 |    16 |     return;
 595 |   279 |   tensor_init_states->compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
 596 |   279 |   ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);
 597 |   279 |   const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
 598 |   279 |   const int parallel_count = tensor_init_states->parallel_count;
 599 |   279 |   int i;
 600 |   759 |   for (i = 1; i < parallel_count; i++)
 601 |   480 |   {
 602 |   480 |     ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
 603 |   480 |     if (copy)
 604 |   480 |       ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
 605 |   480 |   }
 606 |   279 | }
 607 |       |
 608 |       | // This method can only handle cases we added new tensors and exec, never delete. This invariant is true because
 609 |       | // we setup everything (including calling simplify method) in ccv_cnnp_model_compile method, before this rewind setup.
 610 |       | static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
 611 |     2 | {
 612 |     2 |   assert(model->graph);
 613 |     2 |   assert(model->compiled_data);
 614 |     2 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 615 |     2 |   assert(compiled_data->rewindables);
 616 |     2 |   int i;
 617 |    53 |   for (i = 0; i < compiled_data->rewindables->rnum; i++)
 618 |    51 |   {
 619 |    51 |     const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
 620 |    51 |     if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
 621 |    16 |       ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
 622 |    35 |     else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
 623 |    35 |       ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
 624 |    51 |   }
 625 |     2 |   ccv_array_clear(compiled_data->rewindables);
 626 |     2 |   ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
 627 |     2 | }
 628 |       |
 629 |       |
 630 |       | static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
 631 | 5.94k | {
 632 | 5.94k |   const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 633 | 5.94k |     .type = CCV_CNNP_REWIND_TENSOR,
 634 | 5.94k |     .tensor = symbol
 635 | 5.94k |   };
 636 | 5.94k |   ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 637 | 5.94k |   ccv_array_push(rewind_symbols, &rewind_symbol);
 638 | 5.94k | }
 639 |       |
 640 |       | static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
 641 |   470 | {
 642 |   470 |   const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 643 |   470 |     .type = CCV_CNNP_REWIND_TENSOR,
 644 |   470 |     .tensor = symbol
 645 |   470 |   };
 646 |   470 |   ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 647 |   470 |   ccv_array_push(rewind_symbols, &rewind_symbol);
 648 |   470 | }
 649 |       |
 650 |       | static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
 651 | 2.24k | {
 652 | 2.24k |   const ccv_cnnp_rewind_symbol_t rewind_symbol = {
 653 | 2.24k |     .type = CCV_CNNP_REWIND_GRAPH_EXEC,
 654 | 2.24k |     .graph_exec = symbol
 655 | 2.24k |   };
 656 | 2.24k |   ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
 657 | 2.24k |   ccv_array_push(rewind_symbols, &rewind_symbol);
 658 | 2.24k | }
 659 |       |
 660 |       | static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
 661 | 35.0k | {
 662 | 35.0k |   ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
 663 | 35.0k |   if (!CCV_NO_GRAPH_EXEC(update_exec))
 664 | 35.0k |     ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
 665 | 35.0k |   int i;
 666 | 50.0k |   for (i = 1; i < parallel_count; i++)
 667 | 14.9k |   {
 668 | 14.9k |     ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
 669 | 14.9k |     const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
 670 | 14.9k |     if (!CCV_NO_GRAPH_EXEC(copy))
 671 | 14.9k |       ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
 672 | 14.9k |   }
 673 | 35.0k | }
 674 |       |
 675 |       | static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
 676 | 20.0k | {
 677 | 20.0k |   assert(compiled_data);
 678 | 20.0k |   assert(symbolic_graph);
 679 | 20.0k |   ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
 680 | 20.0k |   int i;
 681 | 35.1k |   for (i = 1; i < parallel_count; i++)
 682 | 15.0k |   {
 683 | 15.0k |     ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
 684 | 15.0k |     if (copy_symbol.graph)
 685 | 14.9k |       ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
 686 | 15.0k |   }
 687 | 20.0k |   ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
 688 | 20.0k |   if (graph_exec_arena)
 689 | 20.0k |     _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
 690 | 20.0k |   // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
 691 | 20.0k |   ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
 692 | 20.0k |   if (gradient_graph_exec_arena)
 693 | 15.0k |     _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
 694 | 20.0k | }
 695 |       |
 696 |       | static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
 697 | 20.0k | {
 698 | 20.0k |   int this_parameter_flag = 0;
 699 | 20.0k |   const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
 700 | 20.0k |   int j, k;
 701 | 20.0k |   // For no-op, we can preserve previous saved_aux_size.
 702 | 20.0k |   if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)
 703 |    64 |   {
 704 |    64 |     // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
 705 |    64 |     // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
 706 |    64 |     // noop and a minimizer. We don't want that because we do that in high-level frameworks to
 707 |    64 |     // make sure some model parameters don't update if we don't want them to.
 708 |    64 |     int old_saved_aux_size;
 709 |    64 |     if (old_minimizer.cmd == CCV_NNC_NOOP)
 710 |    64 |     {
 711 |    64 |       int input_size;
 712 |    64 |       ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
 713 |    64 |       if (input_size < 2) // This is not legit.
 714 |     0 |         old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
 715 |    64 |       else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
 716 |    64 |         old_saved_aux_size = input_size - 2;
 717 |    64 |     } else
 718 |     0 |       old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
 719 |    64 |     if (old_saved_aux_size != saved_aux_size)
 720 |    62 |     {
 721 |    62 |       this_parameter_flag = 1;
 722 |    62 |       if (saved_aux_size > old_saved_aux_size)
 723 |    62 |       {
 724 |    62 |         // Allocate new tensor symbols.
 725 |    62 |         const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
 726 |   180 |         for (j = old_saved_aux_size; j < saved_aux_size; j++)
 727 |   118 |         {
 728 |   118 |           saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
 729 |   118 |           saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
 730 |   454 |           for (k = 1; k < parallel_count; k++)
 731 |   336 |           {
 732 |   336 |             ccv_nnc_tensor_param_t dev_info = info;
 733 |   336 |             CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
 734 |   336 |             const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
 735 |   336 |             const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
 736 |   336 |             ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
 737 |   336 |             ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
 738 |   336 |           }
 739 |   118 |         }
 740 |    62 |       } else {
 741 |     0 |         for (j = saved_aux_size; j < old_saved_aux_size; j++)
 742 |     0 |         {
 743 |     0 |           for (k = 1; k < parallel_count; k++)
 744 |     0 |           {
 745 |     0 |             const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
 746 |     0 |             if (src_copy.d >= 0)
 747 |     0 |             {
 748 |     0 |               ccv_nnc_tensor_symbol_free(graph, src_copy);
 749 |     0 |               ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
 750 |     0 |             }
 751 |     0 |             const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
 752 |     0 |             if (dest_copy.d >= 0)
 753 |     0 |             {
 754 |     0 |               ccv_nnc_tensor_symbol_free(graph, dest_copy);
 755 |     0 |               ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
 756 |     0 |             }
 757 |     0 |           }
 758 |     0 |           ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
 759 |     0 |           ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
 760 |     0 |           saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
 761 |     0 |         }
 762 |     0 |       }
 763 |    62 |     }
 764 |    64 |   }
 765 | 20.0k |   _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer);
 766 | 20.0k |   if (this_parameter_flag)
 767 |    62 |   {
 768 |    62 |     ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
 769 |    62 |     ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
 770 |    62 |     const int* inputs = 0;
 771 |    62 |     int input_size = 0;
 772 |    62 |     ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
 773 |    62 |     assert(input_size >= 1);
 774 |    62 |     update_inputs[0].d = inputs[0];
 775 |    62 |     update_inputs[0].graph = graph;
 776 |    62 |     update_inputs[1].d = inputs[1];
 777 |    62 |     update_inputs[1].graph = graph;
 778 |    62 |     update_outputs[0] = updated_parameters[parameter_indice];
 779 |   180 |     for (j = 0; j < saved_aux_size; j++)
 780 |   118 |     {
 781 |   118 |       update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
 782 |   118 |       update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
 783 |   118 |     }
 784 |    62 |     ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
 785 |   230 |     for (k = 1; k < parallel_count; k++)
 786 |   168 |     {
 787 |   168 |       const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
 788 |   168 |       assert(copy.d >= 0);
 789 |   168 |       ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
 790 |   168 |       assert(input_size >= 1);
 791 |   168 |       update_inputs[0].d = inputs[0];
 792 |   168 |       update_inputs[0].graph = graph;
 793 |   168 |       update_inputs[1].d = inputs[1];
 794 |   168 |       update_inputs[1].graph = graph;
 795 |   168 |       update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
 796 |   504 |       for (j = 0; j < saved_aux_size; j++)
 797 |   336 |       {
 798 |   336 |         update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
 799 |   336 |         update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
 800 |   336 |       }
 801 |   168 |       ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
 802 |   168 |     }
 803 |    62 |   }
 804 | 20.0k |   return this_parameter_flag;
 805 | 20.0k | }
 806 |       |
 807 |       | typedef struct {
 808 |       |   int parameter_size;
 809 |       |   ccv_nnc_cmd_t minimizer;
 810 |       |   ccv_cnnp_model_io_t parameters[1];
 811 |       | } ccv_cnnp_set_minimizer_for_parameter_t;
 812 |       |
 813 |       | static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
 814 |   296 | {
 815 |   296 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 816 |   296 |   assert(compiled_data);
 817 |   296 |   const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
 818 |   296 |   // We update all parameters, at this point, we have one minimizer.
 819 |   296 |   const int parameter_size = compiled_data->parameters->rnum;
 820 |   296 |   ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
 821 |   296 |   ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
 822 |   296 |   assert(symbolic_graph);
 823 |   296 |   const int parallel_count = ccv_max(model->parallel_count, 1);
 824 |   296 |   ccv_array_t* const parameters = compiled_data->minimize.parameters;
 825 |   296 |   ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
 826 |   296 |   int i, j, flag = 0;
 827 |   301 |   for (i = 0; i < parameters->rnum; i++)
 828 |     5 |   {
 829 |     5 |     ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
 830 |    10 |     for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++)
 831 |     5 |     {
 832 |     5 |       const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 1 : set_minimizer_for_parameter->parameters[j]->param_sel;
 833 |     5 |       assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
 834 |     5 |       const int old_rnum = parameter_indices->rnum;
 835 |     5 |       ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
 836 |     5 |       const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 1 : set_minimizer_for_parameter->parameters[j]->param_ref;
 837 |     5 |       assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
 838 |     5 |       if (param_ref >= 0)
 839 |     0 |       {
 840 |     0 |         assert(param_ref + old_rnum < parameter_indices->rnum);
 841 |     0 |         *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
 842 |     0 |         parameter_indices->rnum = old_rnum + 1;
 843 |     0 |       }
 844 |     5 |     }
 845 |     5 |     const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
 846 |     5 |     // We may have duplicated indices, but that is OK, we will set it twice.
 847 |    58 |     for (j = 0; j < parameter_indices->rnum; j++)
 848 |    53 |     {
 849 |    53 |       const int d = *(int*)ccv_array_get(parameter_indices, j);
 850 |    53 |       assert(d <= parameter_size);
 851 |    53 |       if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
 852 |     0 |         flag = 1;
 853 |    53 |     }
 854 |     5 |     ccv_array_clear(parameter_indices);
 855 |     5 |   }
 856 |   296 |   ccv_array_free(parameter_indices);
 857 |   296 |   return flag;
 858 |   296 | }
 859 |       |
 860 |       | static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
 861 | 2.23k | {
 862 | 2.23k |   if (new_saved_aux_size == old_saved_aux_size)
 863 | 2.23k |     return;
 864 |     6 |   assert(new_saved_aux_size > old_saved_aux_size);
 865 |     6 |   int i, j;
 866 |    68 |   for (i = parameter_size - 1; i >= 0; i--)
 867 |    62 |   {
 868 |   180 |     for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; j--)
 869 |   118 |       saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
 870 |    62 |     for (j = old_saved_aux_size - 1; j >= 0; j--)
 871 |     0 |       saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
 872 |    62 |   }
 873 |     6 | }
 874 |       |
 875 |       | static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
 876 |    32 | {
 877 |    32 |   ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 878 |    32 |   assert(compiled_data);
 879 |    32 |   if (!compiled_data->rewindables)
 880 |    32 |     compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
 881 |    32 |   ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables);
 882 |    32 |   ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables);
 883 |    32 |   ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables);
 884 |    32 | }
885
886
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
887
2.23k
{
888
2.23k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
889
2.23k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
890
2.23k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
891
2.23k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
892
2.23k
  assert(evaluate_to_size > 0);
893
2.23k
  const int parallel_count = ccv_max(model->parallel_count, 1);
894
2.23k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
895
2.23k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
896
2.23k
  int i, j;
897
2.23k
  const int output_size = model->output_size;
898
2.23k
  assert(!fits || fit_size == output_size * parallel_count);
899
2.23k
  if (fits)
900
6
    
for (i = 0; 3
i < output_size;
i++3
)
901
3
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
902
2.23k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
903
2.23k
  const int parameter_size = compiled_data->parameters->rnum;
904
2.23k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
905
2.23k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
906
2.23k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
907
2.23k
  int parameter_size_maybe_more = parameter_size;
908
2.23k
  compiled_data->disable_outgrad = disable_outgrad;
909
2.23k
  int outgrad_size;
910
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.22k
)
911
6
    outgrad_size = 0;
912
2.22k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
913
2.22k
    outgrad_size = model->input_size;
914
3
  else {
915
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
916
3
    outgrad_size = 0;
917
9
    for (i = 0; i < model->input_size; 
i++6
)
918
6
      if (!(disable_outgrad & ((uint64_t)1 << i)))
919
3
        ++outgrad_size;
920
3
  }
921
2.23k
  compiled_data->outgrad_size = outgrad_size;
922
2.23k
  parameter_size_maybe_more += outgrad_size;
923
2.23k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
924
2.23k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? 
compiled_data->gradients + parameter_size2.22k
:
06
;
925
2.23k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
926
2.23k
  compiled_data->backward.to_size = parameter_size_maybe_more;
927
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || 
model->input_size == 02.22k
)
928
6
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
929
2.22k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
930
2.22k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
931
3
  else { // Compute minimize with gradients including selected inputs.
932
3
    assert(model->input_size > 0);
933
3
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
934
3
    assert(outgrad_size > 0);
935
3
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
936
3
    j = 0;
937
9
    for (i = 0; i < model->input_size; 
i++6
)
938
6
      if (!(disable_outgrad & ((uint64_t)1 << i)))
939
3
        outgrads[j++] = model->inputs[i];
940
3
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
941
3
  }
942
2.23k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
943
2.23k
  if (compiled_data->minimize.parameters)
944
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
945
4.46k
  for (i = 0; i < output_size; 
i++2.23k
)
946
2.23k
  {
947
2.23k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
948
2.23k
    // Init this to 1 so we can backprop.
949
2.23k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
950
2.23k
  }
951
7.13k
  for (i = 0; i < parameter_size_maybe_more; 
i++4.90k
)
952
4.90k
    compiled_data->backward.tos[i] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
953
2.23k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
954
2.23k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
955
4.47k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; i++)
956
2.24k
  {
957
2.24k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
958
2.24k
    const int* tos;
959
2.24k
    int to_size;
960
2.24k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
961
2.24k
    if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination; otherwise this is covered in update_nodes.
962
5
    {
963
5
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
964
5
      int flag = 0;
965
6
      for (j = i - 1; !flag && j >= 0; j--)
966
1
        flag = (destinations[j + parameter_size].d == outgrad.d);
967
5
      if (!flag) // Only if we cannot find it, we add it.
968
4
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
969
5
    }
970
2.24k
  }
971
2.23k
  if (parallel_count > 1)
972
9
  {
973
9
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
974
9
      0, 0,
975
9
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
976
9
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
977
9
      0, 0, 0,
978
9
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
979
9
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
980
9
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
981
18
    for (i = 0; i < evaluate_to_size; i++)
982
36
      for (j = 1; j < parallel_count; j++)
983
27
      {
984
27
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
985
27
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
986
27
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
987
27
      }
988
176
    for (i = 0; i < parameter_size_maybe_more; i++)
989
668
      for (j = 1; j < parallel_count; j++)
990
501
      {
991
501
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
992
501
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
993
501
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
994
501
      }
995
9
  }
996
2.23k
  // Only use memory compression if we are in gradient parameter mode.
997
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES && model->memory_compression)
998
0
    ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
999
2.23k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1000
2.23k
  compiled_data->gradient_mode = gradient_mode;
1001
2.23k
}
1002
1003
void ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1004
48
{
1005
48
  assert(!compiled_data->tensors.parameters);
1006
48
  const int parameter_size = compiled_data->parameters->rnum;
1007
48
  const int parallel_count = ccv_max(model->parallel_count, 1);
1008
48
  const int internal_size = compiled_data->internals->rnum;
1009
48
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1010
48
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1011
48
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)ccmalloc((sizeof(ccv_nnc_tensor_t*) * parameter_size + sizeof(ccv_nnc_tensor_t*) * internal_size) * parallel_count);
1012
48
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1013
48
  int i, j;
1014
295
  for (i = 0; i < parameter_size; i++)
1015
247
  {
1016
247
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1017
247
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1018
247
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1019
247
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1020
649
    for (j = 1; j < parallel_count; j++)
1021
402
    {
1022
402
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1023
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1024
402
    }
1025
247
  }
1026
106
  for (i = 0; i < internal_size; i++)
1027
58
  {
1028
58
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1029
58
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1030
58
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1031
58
    compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1032
154
    for (j = 1; j < parallel_count; j++)
1033
96
    {
1034
96
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1035
96
      compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1036
96
    }
1037
58
  }
1038
48
}
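A note on the two allocation loops above: replicas are stored device-major, so the copy of tensor i for device j lives at index i + j * tensor_size. A minimal sketch of that addressing, with hypothetical names (`tensors`, `tensor_size` stand in for the arrays built above):

// Hypothetical accessor for the replicated layout built by
// ccv_cnnp_model_tensors_init: stride by tensor_size to reach the copy
// allocated for device j.
static inline ccv_nnc_tensor_t* device_replica(ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int i, const int j)
{
  return tensors[i + j * tensor_size];
}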
1039
1040
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1041
7
{
1042
7
  assert(parallel_count > 0);
1043
7
  int i, j;
1044
41
  for (i = 0; i < tensor_size; i++)
1045
34
  {
1046
34
    if (!tensors[i])
1047
0
      continue;
1048
34
    const int d = tensor_symbols[i].d;
1049
34
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1050
0
      continue;
1051
136
    for (j = 1; j < parallel_count; j++)
1052
102
      if (tensors[i + j * tensor_size])
1053
102
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &tensors[i + j * tensor_size], 1, 0);
1054
34
  }
1055
7
}
1056
1057
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1058
55
{
1059
55
  assert(parallel_count > 0);
1060
55
  int i, j;
1061
113
  for (i = 0; i < tensor_size; i++)
1062
58
  {
1063
58
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1064
154
    for (j = 1; j < parallel_count; j++)
1065
96
    {
1066
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1067
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1068
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1069
0
      { // We shouldn't allocate this, free it up.
1070
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1071
0
        tensors[i + j * tensor_size] = 0;
1072
0
      }
1073
96
    }
1074
58
  }
1075
55
}
1076
1077
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1078
323
{
1079
323
  assert(parallel_count > 0);
1080
323
  int i, j;
1081
1.50k
  for (i = 0; i < tensor_size; i++)
1082
1.18k
  {
1083
1.18k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1084
1.18k
    if (graph)
1085
1.18k
    {
1086
1.18k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1087
1.18k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1088
1
        tensor_symbol = alias_to;
1089
1.18k
    }
1090
1.18k
    ccv_nnc_tensor_t* const tensor = tensors[i];
1091
1.18k
    if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL)
1092
1.18k
    {
1093
1.18k
      const ccv_nnc_tensor_bind_t retained_bind = {
1094
1.18k
        .symbol = tensor_symbol,
1095
1.18k
        .tensor = tensor
1096
1.18k
      };
1097
1.18k
      ccv_array_push(tensor_binds, &retained_bind);
1098
1.18k
    }
1099
2.82k
    for (j = 1; j < parallel_count; j++)
1100
1.64k
    {
1101
1.64k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1102
1.64k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1103
1.64k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1104
1.64k
      {
1105
1.64k
        const ccv_nnc_tensor_bind_t bind = {
1106
1.64k
          .symbol = copy,
1107
1.64k
          .tensor = tensors[i + j * tensor_size]
1108
1.64k
        };
1109
1.64k
        ccv_array_push(tensor_binds, &bind);
1110
1.64k
      }
1111
1.64k
    }
1112
1.18k
  }
1113
323
}
1114
1115
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1116
2.31k
{
1117
2.31k
  if (compiled_data->graph)
1118
55
    ccv_nnc_graph_free(compiled_data->graph);
1119
2.31k
  compiled_data->graph = 0;
1120
2.31k
  compiled_data->is_test = 0;
1121
2.31k
  if (compiled_data->tensor_arena)
1122
55
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1123
2.31k
  compiled_data->tensor_arena = 0;
1124
2.31k
  if (compiled_data->graph_exec_arena)
1125
55
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1126
2.31k
  compiled_data->graph_exec_arena = 0;
1127
2.31k
  if (compiled_data->backward.from_ops)
1128
2.31k
    
ccfree24
(compiled_data->backward.from_ops)24
;
1129
2.31k
  compiled_data->backward.from_ops = 0;
1130
2.31k
  if (compiled_data->evaluate.schedule)
1131
29
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1132
2.31k
  compiled_data->evaluate.schedule = 0;
1133
2.31k
  if (compiled_data->backward.schedule)
1134
21
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1135
2.31k
  compiled_data->backward.schedule = 0;
1136
2.31k
}
1137
1138
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1139
2.25k
{
1140
2.25k
  if (compiled_data->gradients)
1141
2.25k
    ccfree(compiled_data->gradients);
1142
2.25k
  compiled_data->gradients = 0;
1143
2.25k
  if (compiled_data->updated_parameters)
1144
2.25k
    ccfree(compiled_data->updated_parameters);
1145
2.25k
  compiled_data->updated_parameters = 0;
1146
2.25k
  compiled_data->update_nodes = 0;
1147
2.25k
  compiled_data->saved_aux = 0;
1148
2.25k
}
1149
1150
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1151
2.28k
{
1152
2.28k
  if (compiled_data->backward.gradients)
1153
2.28k
    ccfree(compiled_data->backward.gradients);
1154
2.28k
  compiled_data->backward.gradients = 0;
1155
2.28k
  if (compiled_data->backward.accum)
1156
4
    ccv_nnc_graph_free(compiled_data->backward.accum);
1157
2.28k
  compiled_data->backward.accum = 0;
1158
2.28k
  if (compiled_data->backward.tensor_arena)
1159
4
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1160
2.28k
  compiled_data->backward.tensor_arena = 0;
1161
2.28k
  if (compiled_data->backward.graph_exec_arena)
1162
4
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1163
2.28k
  compiled_data->backward.graph_exec_arena = 0;
1164
2.28k
}
1165
1166
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1167
2.26k
{
1168
2.26k
  if (compiled_data->apply_gradients.graph)
1169
19
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1170
2.26k
  compiled_data->apply_gradients.graph = 0;
1171
2.26k
  if (compiled_data->apply_gradients.tensor_arena)
1172
19
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1173
2.26k
  compiled_data->apply_gradients.tensor_arena = 0;
1174
2.26k
  if (compiled_data->apply_gradients.graph_exec_arena)
1175
19
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1176
2.26k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1177
2.26k
}
1178
1179
// Compile the graph to run ccv_cnnp_model_fit
1180
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1181
5
{
1182
5
  int i, j;
1183
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1184
5
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1185
5
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1186
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1187
5
  assert(output_size == model->output_size * parallel_count);
1188
5
  assert(!fits || output_size == fit_size);
1189
5
  assert(output_size > 0);
1190
5
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1191
5
  {
1192
5
    _ccv_cnnp_model_set_rewindables(model);
1193
5
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1194
5
  } else if (compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES) {
1195
0
    _ccv_cnnp_model_rewind_graph(model);
1196
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1197
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1198
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1199
0
  }
1200
5
  const int tensors_init = !!compiled_data->tensors_init.v;
1201
5
  if (!tensors_init)
1202
4
    ccv_cnnp_model_tensors_init(model, compiled_data);
1203
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1204
5
  assert((input_size % parallel_count) == 0);
1205
5
  assert((output_size % parallel_count) == 0);
1206
5
  assert((fit_size % parallel_count) == 0);
1207
5
  const int input_size_per_p = input_size / parallel_count;
1208
5
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1209
5
  const int output_size_per_p = output_size / parallel_count;
1210
5
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1211
5
  const int fit_size_per_p = fit_size / parallel_count;
1212
5
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1213
5
  const int parameter_size = compiled_data->parameters->rnum;
1214
5
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1215
5
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1216
5
  const int internal_size = compiled_data->internals->rnum;
1217
5
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1218
5
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1219
5
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1220
5
  ccv_array_free(tensor_binds);
1221
5
  if (tensors_init && parallel_count > 1)
1222
0
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1223
5
  // If tensor is not init'ed, we need to init states first.
1224
5
  if (_ccv_cnnp_any_to_init(compiled_data))
1225
4
  {
1226
4
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1227
4
      .parallel_count = parallel_count,
1228
4
      .graph = model->graph,
1229
4
      .compiled_data = compiled_data,
1230
4
      .tensor_arena = compiled_data->tensor_arena
1231
4
    };
1232
4
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1233
4
  }
1234
5
  compiled_data->is_test = 0;
1235
5
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1236
5
  // No need to set because it is default to training mode.
1237
5
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1238
87
  for (i = 0; i < saved_aux_size * parameter_size; i++)
1239
82
  {
1240
82
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1241
82
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1242
286
    for (j = 1; j < parallel_count; j++)
1243
204
    {
1244
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1245
204
      if (copy)
1246
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1247
204
    }
1248
82
  }
1249
5
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1250
5
  compiled_data->evaluate.to_op_size = 0;
1251
16
  for (i = 0; i < evaluate_to_size; i++)
1252
11
  {
1253
11
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1254
11
    if (to.graph)
1255
11
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1256
11
  }
1257
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1258
5
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1259
5
}
1260
1261
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1262
0
{
1263
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1264
0
  if (!compiled_data || !compiled_data->graph)
1265
0
    return 0;
1266
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1267
0
}
1268
1269
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1270
0
{
1271
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1272
0
  if (!compiled_data || !compiled_data->tensor_arena)
1273
0
    return 0;
1274
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1275
0
}
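Both accessors above return 0 until a graph has been jitted, so callers can poll them safely. A minimal sketch, assuming a `model` compiled and run elsewhere plus <stdio.h> for the print:

// Sketch: query the arena footprint and default stream after a run. Both
// calls return 0/NULL before any of the jit paths in this file have executed.
static void report_memory(const ccv_cnnp_model_t* const model)
{
  const uint64_t bytes = ccv_cnnp_model_memory_size(model);
  ccv_nnc_stream_context_t* const stream = ccv_cnnp_model_default_stream(model);
  printf("arena: %llu bytes, stream: %p\n", (unsigned long long)bytes, (void*)stream);
}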
1276
1277
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1278
38.4k
{
1279
38.4k
  int i, j;
1280
113k
  for (i = 0; i < tensor_size; i++)
1281
75.1k
  {
1282
75.1k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1283
75.1k
    if (graph)
1284
72.2k
    {
1285
72.2k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1286
72.2k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1287
1.00k
        tensor_symbol = alias_to;
1288
72.2k
    }
1289
75.1k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1290
76.9k
    for (j = 1; j < parallel_count; j++)
1291
1.76k
    {
1292
1.76k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1293
1.76k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1294
1.76k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1295
1.76k
    }
1296
75.1k
  }
1297
38.4k
}
1298
1299
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1300
2.41k
{
1301
2.41k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1302
2.41k
  assert(compiled_data);
1303
2.41k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1304
2.41k
  assert(output_size == model->output_size * parallel_count);
1305
2.41k
  assert(input_size == model->input_size * parallel_count);
1306
2.41k
  assert(!fits || fit_size == output_size);
1307
2.41k
  assert(model->graph);
1308
2.41k
  if (!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE)
1309
5
  {
1310
5
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1311
5
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1312
5
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1313
5
    // Compile the symbolic graph down only when needed.
1314
5
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1315
2.41k
  } else {
1316
2.41k
    assert((input_size % parallel_count) == 0);
1317
2.41k
    assert((output_size % parallel_count) == 0);
1318
2.41k
    assert((fit_size % parallel_count) == 0);
1319
2.41k
    const int input_size_per_p = input_size / parallel_count;
1320
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1321
2.41k
    const int output_size_per_p = output_size / parallel_count;
1322
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1323
2.41k
    const int fit_size_per_p = fit_size / parallel_count;
1324
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1325
2.41k
  }
1326
2.41k
  if (compiled_data->is_test)
1327
0
  {
1328
0
    compiled_data->is_test = 0;
1329
0
    ccv_nnc_graph_exec_update_t update = {
1330
0
      .parallel_count = parallel_count,
1331
0
      .graph = model->graph,
1332
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1333
0
    };
1334
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1335
0
  }
1336
2.41k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1337
2.41k
}
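A hedged sketch of driving the fit path above for a single-input, single-output model; `model`, `input`, `fit` and `output` are assumed to come from elsewhere (the compile step is outside this report):

// One-call training: forward, backward and parameter update in a single run.
static void train_step_with_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const input, ccv_nnc_tensor_t* const fit, ccv_nnc_tensor_t* const output)
{
  ccv_nnc_tensor_t* inputs[] = { input };
  ccv_nnc_tensor_t* fits[] = { fit };
  ccv_nnc_tensor_t* outputs[] = { output };
  // The first call triggers _ccv_cnnp_model_fit_jit; later calls only rebind tensors.
  ccv_cnnp_model_fit(model, inputs, 1, fits, 1, outputs, 1, 0, 0);
}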
1338
1339
// Compile the graph to run ccv_cnnp_model_evaluate with require_grad = false (MULTISTAGE_MODE_NO_GRAD).
1340
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1341
26
{
1342
26
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1343
26
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1344
26
  const int parallel_count = ccv_max(model->parallel_count, 1);
1345
26
  assert(output_size == model->output_size * parallel_count);
1346
26
  assert(output_size > 0);
1347
26
  // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather,
1348
26
  // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel.
1349
26
  if (parallel_count > 1 && compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1350
6
  {
1351
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1352
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1353
6
    _ccv_cnnp_model_set_rewindables(model);
1354
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1355
6
      0, 0,
1356
6
      0, 0, 0,
1357
6
      0, 0, 0,
1358
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1359
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1360
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1361
6
    int i, j;
1362
12
    for (i = 0; i < evaluate_to_size; i++)
1363
24
      for (j = 1; j < parallel_count; j++)
1364
18
      {
1365
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1366
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1367
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1368
18
      }
1369
6
  }
1370
26
  const int tensors_init = !!compiled_data->tensors_init.v;
1371
26
  if (!tensors_init)
1372
14
    ccv_cnnp_model_tensors_init(model, compiled_data);
1373
26
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1374
26
  assert((input_size % parallel_count) == 0);
1375
26
  assert((output_size % parallel_count) == 0);
1376
26
  const int input_size_per_p = input_size / parallel_count;
1377
26
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1378
26
  const int output_size_per_p = output_size / parallel_count;
1379
26
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1380
26
  const int parameter_size = compiled_data->parameters->rnum;
1381
26
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1382
26
  const int internal_size = compiled_data->internals->rnum;
1383
26
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1384
26
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1385
26
  // If we generated gradients for the graph, only compile part of the graph because the rest is irrelevant for evaluation.
1386
26
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1387
26
  ccv_array_free(tensor_binds);
1388
26
  // If tensor is not init'ed, we need to init states first.
1389
26
  if (tensors_init && parallel_count > 1)
1390
7
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1391
26
  if (_ccv_cnnp_any_to_init(compiled_data))
1392
13
  {
1393
13
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1394
13
      .parallel_count = parallel_count,
1395
13
      .graph = model->graph,
1396
13
      .compiled_data = compiled_data,
1397
13
      .tensor_arena = compiled_data->tensor_arena
1398
13
    };
1399
13
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1400
13
  }
1401
26
  compiled_data->is_test = 1;
1402
26
  ccv_nnc_graph_exec_update_t update = {
1403
26
    .parallel_count = parallel_count,
1404
26
    .graph = model->graph,
1405
26
    .graph_exec_arena = compiled_data->graph_exec_arena,
1406
26
  };
1407
26
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1408
26
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1409
26
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1410
26
}
1411
1412
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1413
22
{
1414
22
  assert(!compiled_data->tensors.gradients);
1415
22
  const int parameter_size = compiled_data->parameters->rnum;
1416
22
  const int parallel_count = ccv_max(model->parallel_count, 1);
1417
22
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1418
22
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1419
22
  int i, j;
1420
159
  for (i = 0; i < parameter_size; i++)
1421
137
  {
1422
137
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1423
137
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1424
137
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1425
137
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1426
137
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1427
317
    for (j = 1; j < parallel_count; j++)
1428
180
    {
1429
180
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1430
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1431
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1432
180
    }
1433
137
  }
1434
22
}
1435
1436
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1437
7.95k
{
1438
7.95k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1439
8
    return 1;
1440
7.94k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1441
7.93k
    return 0;
1442
4
  int i;
1443
4
  for (i = 0; i < input_size; i++)
1444
4
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1445
4
      return 0;
1446
4
  return 1;
1447
4
}
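Between the two sentinel values, disable_outgrad acts as a per-input bitmask, as the loop above shows. A small illustration (the two-input model is hypothetical):

// Bit i set means "do not compute the gradient w.r.t. input i". For a
// hypothetical two-input model, keep the outgrad for input 0, drop input 1:
const uint64_t disable_outgrad = (uint64_t)1 << 1;
// CCV_CNNP_DISABLE_OUTGRAD_NONE keeps every input gradient;
// CCV_CNNP_DISABLE_OUTGRAD_ALL drops them all, which is what the function
// above detects.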
1448
1449
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1450
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1451
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1452
24
{
1453
24
  int i, j;
1454
24
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1455
24
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS;
1456
24
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1457
24
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1458
24
  const int parallel_count = ccv_max(model->parallel_count, 1);
1459
24
  assert(output_size == model->output_size * parallel_count);
1460
24
  assert(output_size > 0);
1461
24
  // There shouldn't be a loss function if we evaluate with multistage jit.
1462
24
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1463
24
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1464
21
  {
1465
21
    _ccv_cnnp_model_set_rewindables(model);
1466
21
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1467
21
  } else if (compiled_data->gradient_mode != target_gradient_mode) {
1468
2
    _ccv_cnnp_model_rewind_graph(model);
1469
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1470
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1471
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1472
2
  }
1473
24
  const int tensors_init = !!compiled_data->tensors_init.v;
1474
24
  if (!tensors_init)
1475
15
    ccv_cnnp_model_tensors_init(model, compiled_data);
1476
24
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1477
24
  assert((input_size % parallel_count) == 0);
1478
24
  assert((output_size % parallel_count) == 0);
1479
24
  const int input_size_per_p = input_size / parallel_count;
1480
24
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1481
24
  const int output_size_per_p = output_size / parallel_count;
1482
24
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1483
24
  const int parameter_size = compiled_data->parameters->rnum;
1484
24
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1485
24
  const int internal_size = compiled_data->internals->rnum;
1486
24
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1487
24
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1488
24
  if (!compiled_data->tensors.gradients)
1489
22
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1490
24
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1491
24
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1492
24
  ccv_array_free(tensor_binds);
1493
24
  if (tensors_init && parallel_count > 1)
1494
0
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1495
24
  // If tensor is not init'ed, we need to init states first.
1496
24
  if (_ccv_cnnp_any_to_init(compiled_data))
1497
15
  {
1498
15
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1499
15
      .parallel_count = parallel_count,
1500
15
      .graph = model->graph,
1501
15
      .compiled_data = compiled_data,
1502
15
      .tensor_arena = compiled_data->tensor_arena
1503
15
    };
1504
15
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1505
15
  }
1506
24
  compiled_data->is_test = is_test;
1507
24
  ccv_nnc_graph_exec_update_t update = {
1508
24
    .parallel_count = parallel_count,
1509
24
    .graph = model->graph,
1510
24
    .graph_exec_arena = compiled_data->graph_exec_arena,
1511
24
  };
1512
24
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1513
24
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1514
24
  compiled_data->evaluate.to_op_size = 0;
1515
24
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1516
66
  for (i = 0; i < evaluate_to_size; i++)
1517
42
  {
1518
42
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1519
42
    if (to_op.graph)
1520
42
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1521
42
    const int* tos;
1522
42
    int to_size;
1523
42
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1524
84
    for (j = 0; j < to_size; j++)
1525
42
    {
1526
42
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1527
42
        .d = tos[j],
1528
42
        .graph = model->graph
1529
42
      });
1530
42
      if (to_op.graph)
1531
42
        ccv_array_add_unique_int(backward_from, to_op.d);
1532
42
    }
1533
42
  }
1534
24
  assert(backward_from->rnum > 0);
1535
24
  compiled_data->backward.from_op_size = backward_from->rnum;
1536
24
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1537
66
  for (i = 0; i < backward_from->rnum; i++)
1538
42
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1539
42
      .d = *(int*)ccv_array_get(backward_from, i),
1540
42
      .graph = compiled_data->graph,
1541
42
    };
1542
24
  ccv_array_free(backward_from);
1543
24
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1544
24
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1545
24
}
1546
1547
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1548
7.92k
{
1549
7.92k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1550
7.92k
  assert(compiled_data);
1551
7.92k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1552
7.92k
  assert(output_size == model->output_size * parallel_count);
1553
7.92k
  assert(input_size == model->input_size * parallel_count);
1554
7.92k
  assert(model->graph);
1555
7.92k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS;
1556
7.92k
  const int mode_mismatch = (params.requires_grad && (compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode || compiled_data->disable_outgrad != params.disable_outgrad));
1557
7.92k
  if (!compiled_data->graph || mode_mismatch)
1558
50
  {
1559
50
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1560
50
    if (mode_mismatch) // If the mode mismatches, we need to redo the backward as well (no need to redo apply_gradients; it doesn't require target_gradient_mode or disable_outgrad).
1561
23
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1562
50
    if (params.requires_grad)
1563
24
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1564
26
    else
1565
26
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1566
7.87k
  } else {
1567
7.87k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1568
7.87k
    assert((input_size % parallel_count) == 0);
1569
7.87k
    const int input_size_per_p = input_size / parallel_count;
1570
7.87k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1571
7.87k
    assert((output_size % parallel_count) == 0);
1572
7.87k
    const int output_size_per_p = output_size / parallel_count;
1573
7.87k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1574
7.87k
  }
1575
7.92k
  if (compiled_data->is_test != params.is_test)
1576
31
  {
1577
31
    compiled_data->is_test = params.is_test;
1578
31
    ccv_nnc_graph_exec_update_t update = {
1579
31
      .parallel_count = parallel_count,
1580
31
      .graph = model->graph,
1581
31
      .graph_exec_arena = compiled_data->graph_exec_arena,
1582
31
    };
1583
31
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1584
31
  }
1585
7.92k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1586
39
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1587
7.88k
  else {
1588
7.88k
    if (!compiled_data->evaluate.schedule)
1589
29
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1590
7.88k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1591
7.88k
  }
1592
7.92k
}
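A hedged inference sketch against ccv_cnnp_model_evaluate as compiled above; `model`, `input` and `output` are assumed set up elsewhere:

// Inference only: no gradient graph, test mode on, no input gradients.
static void infer(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const input, ccv_nnc_tensor_t* const output)
{
  ccv_nnc_tensor_t* inputs[] = { input };
  ccv_nnc_tensor_t* outputs[] = { output };
  ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
    .requires_grad = 0, // Takes the MULTISTAGE_MODE_NO_GRAD jit path above.
    .is_test = 1, // Runs batch norm / dropout style ops in inference mode.
    .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL,
  }, inputs, 1, outputs, 1, 0, 0);
}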
1593
1594
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1595
// Particularly, this method compiles the accumulator graph.
1596
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1597
4
{
1598
4
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1599
4
  assert(compiled_data);
1600
4
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1601
4
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1602
4
  const int parallel_count = ccv_max(model->parallel_count, 1);
1603
4
  const int parameter_size = compiled_data->parameters->rnum;
1604
4
  int i, j;
1605
4
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1606
4
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1607
4
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1608
18
  for (i = 0; i < parameter_size; i++)
1609
28
    for (j = 0; j < parallel_count; j++)
1610
14
    {
1611
14
      const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1612
14
      // Now the old gradient becomes the accumulated gradient; set up a new gradient tensor so we can collect the fresh ones.
1613
14
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1614
14
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1615
14
      ccv_nnc_tensor_symbol_t inputs[2];
1616
14
      inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1617
14
      inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1618
14
      ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1619
14
      ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1620
14
    }
1621
4
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1622
4
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1623
4
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1624
4
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1625
4
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1626
4
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1627
4
  ccv_nnc_symbolic_graph_free(accum);
1628
4
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type);
1629
4
  ccv_array_free(tensor_binds);
1630
4
}
1631
1632
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1633
7.87k
{
1634
7.87k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1635
7.87k
  assert(compiled_data);
1636
7.87k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1637
7.87k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1638
7.87k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1639
7.87k
  if (outgrad_size > 0)
1640
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1641
7.87k
  assert(model->graph);
1642
7.87k
  assert(compiled_data->graph);
1643
7.87k
  const int parameter_size = compiled_data->parameters->rnum;
1644
7.87k
  // If we need to accumulate the gradients now, do jit on accumulator.
1645
7.87k
  if (compiled_data->backward.count > 0)
1646
1.71k
  {
1647
1.71k
    if (!compiled_data->backward.accum)
1648
4
      _ccv_cnnp_model_multistage_jit_1(model);
1649
1.71k
    else if (compiled_data->backward.count == 1) {
1650
496
      //  On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly).
1651
496
      int i;
1652
496
      ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
1653
1.48k
      for (i = 0; i < parameter_size * parallel_count; i++)
1654
986
      {
1655
986
        ccv_nnc_tensor_t* tensor;
1656
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
1657
986
      }
1658
496
      // Do rebind in case we messed up the binding (we switch accum_gradients and gradients).
1659
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
1660
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1661
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1662
496
    }
1663
1.71k
  }
1664
7.87k
  const int ingrad_size_per_p = model->output_size;
1665
7.87k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
1666
7.87k
  int i, j;
1667
15.7k
  for (i = 0; i < ingrad_size_per_p; i++)
1668
7.87k
  {
1669
7.87k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1670
7.87k
    if (!ingrad_size || !ingrads || ingrads[i] == 0)
1671
4.19k
    {
1672
4.19k
      // Set it to 1 if it is not specified.
1673
4.19k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
1674
4.19k
      if (ingrad_tensor)
1675
4.19k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
1676
4.31k
      for (j = 1; j < parallel_count; j++)
1677
120
      {
1678
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
1679
120
        if (ingrad_tensor)
1680
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
1681
120
      }
1682
4.19k
    } else {
1683
3.68k
      // Make sure the length matches, in case it is an alias.
1684
3.68k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
1685
3.68k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
1686
3.69k
      for (j = 1; j < parallel_count; j++)
1687
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
1688
3.68k
    }
1689
7.87k
  }
1690
7.87k
  if (outgrad_size > 0)
1691
2.51k
  {
1692
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
1693
5.13k
    for (i = 0; i < outgrad_size_per_p; i++)
1694
2.62k
      if (outgrads[i])
1695
2.42k
      {
1696
2.42k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
1697
2.42k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
1698
2.43k
        for (j = 1; j < parallel_count; j++)
1699
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
1700
2.42k
      }
1701
5.36k
  } else {
1702
5.36k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
1703
5.36k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
1704
5.36k
  }
1705
7.87k
  // We need to rebind here because in ccv_cnnp_evaluate we clear bindings, which resets all bindings for the gradients.
1706
7.87k
  // For parameters and internals these are fine because when we clear bindings, it restores to original bindings, which are these
1707
7.87k
  // parameters and internals. The same cannot be said for gradients due to the accum_gradients switching.
1708
7.87k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
1709
7.87k
  if (!compiled_data->backward.schedule)
1710
21
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
1711
7.87k
  // Run the backward pass.
1712
7.87k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
1713
7.87k
  // If we need to run accumulation round, do that now.
1714
7.87k
  if (compiled_data->backward.count > 0)
1715
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
1716
7.87k
  // Update the count, this determines whether we need to accumulate or not.
1717
7.87k
  ++compiled_data->backward.count;
1718
7.87k
}
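The backward.count bookkeeping above is what enables gradient accumulation. A minimal sketch under that reading (the evaluate call on the next micro-batch is elided):

// Two micro-batches accumulated into one update; model is assumed to come
// from an earlier requires_grad = 1 evaluate.
static void accumulate_two_microbatches(ccv_cnnp_model_t* const model)
{
  // Passing 0 for ingrads seeds df with ones via the CMD_SET_FORWARD(1) path above.
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0); // backward.count: 0 -> 1
  // ... evaluate the next micro-batch here, then ...
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0); // runs the EWSUM accumulator, count -> 2
  ccv_cnnp_model_apply_gradients(model, 0); // consumes accum_gradients, resets count to 0
}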
1719
1720
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
1721
// Particularly, this method compiles the parameter update graph.
1722
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
1723
19
{
1724
19
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1725
19
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1726
19
  const int parallel_count = ccv_max(model->parallel_count, 1);
1727
19
  const int parameter_size = compiled_data->parameters->rnum;
1728
19
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1729
19
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1730
19
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1731
19
  // Bind accumulated gradients.
1732
19
  if (compiled_data->backward.count > 1)
1733
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
1734
15
  else
1735
15
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1736
19
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
1737
19
  int i, j;
1738
241
  for (i = 0; i < compiled_data->backward.to_size; i++)
1739
222
  {
1740
222
    const int* tos;
1741
222
    int to_size;
1742
222
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
1743
716
    for (j = 0; j < to_size; j++)
1744
494
    {
1745
494
      // Check if this already shows up in the backward graph; if that is the case, it won't be in the apply
1746
494
      // gradients graph.
1747
494
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1748
494
        .d = tos[j],
1749
494
        .graph = model->graph,
1750
494
      });
1751
494
      if (!exec.graph)
1752
309
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
1753
494
    }
1754
222
  }
1755
19
  const int from_size = apply_gradients_from->rnum;
1756
19
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
1757
148
  for (i = 0; i < from_size; i++)
1758
129
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
1759
129
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
1760
129
      .graph = model->graph
1761
129
    };
1762
19
  ccv_array_free(apply_gradients_from);
1763
19
  // It can only end with updates on the parameters.
1764
19
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
1765
148
  for (i = 0; i < parameter_size; i++)
1766
129
  {
1767
129
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
1768
309
    for (j = 1; j < parallel_count; j++)
1769
180
    {
1770
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
1771
180
      ccv_array_push(tos, &copy);
1772
180
    }
1773
129
  }
1774
19
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
1775
19
  ccv_array_free(tos);
1776
19
  ccv_array_free(tensor_binds);
1777
19
  ccfree(froms);
1778
19
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1779
204
  for (i = 0; i < max_saved_aux_size * parameter_size; i++)
1780
185
  {
1781
185
    // Skip on no tensor.
1782
185
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1783
0
      continue;
1784
185
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
1785
185
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1786
533
    for (j = 1; j < parallel_count; j++)
1787
348
    {
1788
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1789
348
      if (copy)
1790
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1791
348
    }
1792
185
  }
1793
19
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type);
1794
19
}
1795
1796
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
1797
7.80k
{
1798
7.80k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1799
7.80k
  assert(compiled_data);
1800
7.80k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1801
7.80k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1802
7.80k
  assert(model->graph);
1803
7.80k
  assert(compiled_data->graph);
1804
7.80k
  // Skip if there is no backward pass.
1805
7.80k
  if (compiled_data->backward.count <= 0)
1806
1.65k
    return;
1807
6.15k
  // Skip if there are no parameters.
1808
6.15k
  if (compiled_data->parameters->rnum == 0)
1809
1
  {
1810
1
    compiled_data->backward.count = 0;
1811
1
    return;
1812
1
  }
1813
6.15k
  if (!compiled_data->apply_gradients.graph)
1814
19
    _ccv_cnnp_model_multistage_jit_2(model);
1815
6.13k
  else {
1816
6.13k
    const int parameter_size = compiled_data->parameters->rnum;
1817
6.13k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
1818
6.13k
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
1819
6.13k
    if (compiled_data->backward.count > 1)
1820
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
1821
5.64k
    else
1822
5.64k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
1823
6.13k
  }
1824
6.15k
  ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
1825
6.15k
  // Reset backward count to 0.
1826
6.15k
  compiled_data->backward.count = 0;
1827
6.15k
}
1828
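[Usage sketch, illustrative only and not part of the measured source: the multistage call order the two functions above serve. `model`, `input` and `output` are hypothetical, and passing 0 ingrads to backward is assumed to default the loss gradient. Calling ccv_cnnp_model_backward() more than once before applying takes the backward.count > 1 accumulation path bound above.]

  ccv_nnc_tensor_t* inputs[] = { input };
  ccv_nnc_tensor_t* outputs[] = { output };
  // Forward pass, recording what backward needs.
  ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
    .requires_grad = 1,
  }, inputs, 1, outputs, 1, 0, 0);
  // Backward pass; may be repeated to accumulate gradients.
  ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
  // Fold the (possibly accumulated) gradients into the parameters.
  ccv_cnnp_model_apply_gradients(model, 0);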
1829
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
1830
8
{
1831
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1832
8
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel;
1833
8
  assert(parameter->param_sel != 0);
1834
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1835
8
  if (!tensors_init)
1836
5
    ccv_cnnp_model_tensors_init(model, compiled_data);
1837
8
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1838
8
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
1839
8
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref;
1840
8
  if (param_ref < 0)
1841
0
    { assert(parameter_indices->rnum == 1); }
1842
8
  else
1843
8
    { assert(param_ref < parameter_indices->rnum); }
1844
8
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
1845
8
  ccv_array_free(parameter_indices);
1846
8
  const int parameter_size = compiled_data->parameters->rnum;
1847
8
  assert(d >= 0);
1848
8
  assert(d < parameter_size);
1849
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1850
8
  ccv_nnc_tensor_t* const dest = compiled_data->tensors.parameters[d];
1851
8
  assert(dest);
1852
8
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
1853
8
  int i;
1854
8
  for (i = 1; i < parallel_count; i++)
1855
0
  {
1856
0
    ccv_nnc_tensor_t* const copy_tensor = compiled_data->tensors.parameters[d + i * parameter_size];
1857
0
    if (copy_tensor)
1858
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
1859
0
  }
1860
8
  // Mark this symbol as init'ed.
1861
8
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
1862
8
  compiled_data->tensors_init.v[s >> 5] |= (1u << (s & 0x1f));
1863
8
}
1864
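[Usage sketch, illustrative only: overwrite one parameter from a host tensor. The `dense` sub-model, the CCV_CNNP_PARAMETER_SELECT_WEIGHT selector and the shape are assumptions; per the function above, the value lands in the primary copy and is then broadcast to every parallel device copy.]

  ccv_nnc_tensor_t* const w = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 784), 0);
  // ... fill w on the host ...
  ccv_cnnp_model_set_parameter(model, ccv_cnnp_model_parameters(dense, CCV_CNNP_PARAMETER_SELECT_WEIGHT, 0), w);
  ccv_nnc_tensor_free(w);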
1865
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
1866
6
{
1867
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1868
6
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel;
1869
6
  assert(parameter->param_sel != 0);
1870
6
  assert(compiled_data->tensors.parameters);
1871
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1872
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
1873
6
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref;
1874
6
  if (param_ref < 0)
1875
3
    { assert(parameter_indices->rnum == 1); }
1876
3
  else
1877
3
    { assert(param_ref < parameter_indices->rnum); }
1878
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
1879
6
  ccv_array_free(parameter_indices);
1880
6
  const int parameter_size = compiled_data->parameters->rnum;
1881
6
  assert(d >= 0);
1882
6
  assert(d < parameter_size);
1883
6
  // We don't need to consider parallel_count; every parameter on each device is identical.
1884
6
  ccv_nnc_tensor_t* const src = compiled_data->tensors.parameters[d];
1885
6
  assert(src);
1886
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
1887
6
}
1888
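[Usage sketch, illustrative only: read a parameter back out. Because every device copy is identical, one transfer from the primary copy suffices; the destination shape is a hypothetical match.]

  ccv_nnc_tensor_t* const w_out = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10, 784), 0);
  ccv_cnnp_model_parameter_copy(model, ccv_cnnp_model_parameters(dense, CCV_CNNP_PARAMETER_SELECT_WEIGHT, 0), w_out);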
1889
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
1890
38
{
1891
38
  const int to_param_sel = parameters->param_sel > 0 ? parameters->param_sel - 1 : parameters->param_sel;
1892
38
  assert(parameters->param_sel != 0);
1893
38
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1894
38
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
1895
38
  *param_ref = parameters->param_ref > 0 ? parameters->param_ref - 1 : parameters->param_ref;
1896
38
  return to_parameter_indices;
1897
38
}
1898
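[Editorial note: param_sel and param_ref are stored 1-based so that 0 can mean "unset" and a negative value can mean "all parameters"; that is why this helper, and the setters above, keep decoding with the `x > 0 ? x - 1 : x` pattern.]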
1899
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref)
1900
12
{
1901
12
  // If the model is not compiled yet, compile it now.
1902
12
  if (!model->graph)
1903
3
  {
1904
3
    model->graph = ccv_nnc_symbolic_graph_new();
1905
3
    assert(from_model->compiled_data);
1906
3
    const int input_size = from_model->input_size;
1907
3
    ccv_nnc_tensor_param_t input_params[input_size];
1908
3
    int i;
1909
9
    for (i = 0; i < input_size; i++)
1910
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
1911
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
1912
3
    model->parallel_count = from_model->parallel_count;
1913
3
    model->memory_compression = from_model->memory_compression;
1914
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
1915
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
1916
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
1917
3
  }
1918
12
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
1919
12
  assert(to_compiled_data);
1920
12
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
1921
12
  if (!to_tensors_init)
1922
9
    ccv_cnnp_model_tensors_init(model, to_compiled_data);
1923
12
  assert(to_compiled_data->tensors.parameters);
1924
12
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
1925
12
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
1926
12
  if (*from_param_ref < 0 && *param_ref >= 0)
1927
0
    { assert((*from_parameter_indices)->rnum == 1); }
1928
12
  else if (*from_param_ref >= 0)
1929
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
1930
12
  if (*param_ref < 0 && *from_param_ref >= 0)
1931
0
    { assert((*parameter_indices)->rnum == 1); }
1932
12
  else if (*param_ref >= 0)
1933
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
1934
12
  // Both sides should resolve to exactly the same number of parameters.
1935
12
  if (*param_ref < 0 && *from_param_ref < 0)
1936
12
    { assert((*from_parameter_indices)->rnum == (*parameter_indices)->rnum); }
1937
12
}
1938
1939
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
1940
9
{
1941
9
  ccv_array_t* to_parameter_indices;
1942
9
  int to_param_ref;
1943
9
  ccv_array_t* from_parameter_indices;
1944
9
  int from_param_ref;
1945
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref);
1946
9
  // To models.
1947
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
1948
9
  assert(to_compiled_data);
1949
9
  // From models.
1950
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
1951
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
1952
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
1953
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 1;
1954
9
  int i, j;
1955
18
  for (i = 0; i < rnum; i++)
1956
9
  {
1957
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices, from_param_ref >= 0 ? from_param_ref : i);
1958
9
    assert(src_d >= 0);
1959
9
    assert(src_d < from_compiled_data->parameters->rnum);
1960
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
1961
9
    // If the original is not init'ed, we cannot copy from it.
1962
9
    if (!(from_compiled_data->tensors_init.v[s >> 5] & (1u << (s & 0x1f))))
1963
0
      continue;
1964
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
1965
9
    assert(dest_d >= 0);
1966
9
    assert(dest_d < to_compiled_data->parameters->rnum);
1967
9
    ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d];
1968
9
    assert(src);
1969
9
    ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
1970
9
    assert(dest);
1971
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
1972
27
    for (j = 1; j < parallel_count; j++)
1973
18
    {
1974
18
      ccv_nnc_tensor_t* const copy_tensor = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
1975
18
      if (copy_tensor)
1976
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
1977
18
    }
1978
9
    // Mark this symbol as init'ed.
1979
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
1980
9
    to_compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
1981
9
  }
1982
9
  ccv_array_free(to_parameter_indices);
1983
9
  ccv_array_free(from_parameter_indices);
1984
9
}
1985
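[Usage sketch, illustrative only: clone every parameter of a trained `source` model into a freshly compiled `target` (both handles hypothetical). Passing -1 for selector and index is assumed to select all parameters, consistent with the 1-based encoding noted earlier.]

  ccv_cnnp_model_set_parameters(target, ccv_cnnp_model_parameters(target, -1, -1), source, ccv_cnnp_model_parameters(source, -1, -1));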
1986
static ccv_nnc_synced_stream_t _ccv_cnnp_compiled_data_get_synced_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
1987
24
{
1988
24
  if (!compiled_data->synced_streams)
1989
4
    compiled_data->synced_streams = kh_init(synced_stream);
1990
24
  int ret = 0;
1991
24
  khiter_t k = kh_put(synced_stream, compiled_data->synced_streams, type, &ret);
1992
24
  assert(ret >= 0);
1993
24
  ccv_nnc_synced_stream_t* const synced_stream = &kh_val(compiled_data->synced_streams, k);
1994
24
  // If ret == 0, the key already exists and we can return directly; otherwise, create and return.
1995
24
  if (ret != 0)
1996
16
  {
1997
16
    synced_stream->stream = ccv_nnc_stream_context_new(type);
1998
16
    synced_stream->synced = ccv_nnc_stream_signal_new(type);
1999
16
  }
2000
24
  return *synced_stream;
2001
24
}
2002
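[Editorial note: the helper above lazily caches one (stream, signal) pair per stream type and device id in a khash, so the parameters_zip_map / parameters_map paths below reuse streams across calls instead of allocating one per invocation.]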
2003
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2004
3
{
2005
3
  ccv_array_t* to_parameter_indices;
2006
3
  int to_param_ref;
2007
3
  ccv_array_t* from_parameter_indices;
2008
3
  int from_param_ref;
2009
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref);
2010
3
  // To models.
2011
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2012
3
  assert(to_compiled_data);
2013
3
  // From models.
2014
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2015
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2016
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2017
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 1;
2018
3
  int i, j;
2019
6
  for (i = 0; i < rnum; i++)
2020
3
  {
2021
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices, from_param_ref >= 0 ? from_param_ref : i);
2022
3
    assert(src_d >= 0);
2023
3
    assert(src_d < from_compiled_data->parameters->rnum);
2024
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2025
3
    // If the original is not init'ed, we cannot copy from it.
2026
3
    if (!(from_compiled_data->tensors_init.v[s >> 5] & (1u << (s & 0x1f))))
2027
0
      continue;
2028
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2029
3
    assert(dest_d >= 0);
2030
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2031
3
    if (parallel_count > 1)
2032
2
    {
2033
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2034
2
      ccv_nnc_stream_signal_t* signal;
2035
2
      if (stream_context)
2036
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2037
10
      for (j = 0; j < parallel_count; j++)
2038
8
      {
2039
8
        ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d + j * to_parameter_size];
2040
8
        ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2041
8
        if (!dest || !src)
2042
0
        {
2043
0
          streams[j] = 0;
2044
0
          continue;
2045
0
        }
2046
8
        // At the moment, we can only handle them on the same device.
2047
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2048
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2049
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2050
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2051
8
        int type = stream_type;
2052
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2053
8
        ccv_nnc_synced_stream_t stream_0 = _ccv_cnnp_compiled_data_get_synced_stream(to_compiled_data, type);
2054
8
        // Wait signal to finish.
2055
8
        if (stream_context)
2056
4
          ccv_nnc_stream_context_wait_signal(stream_0.stream, signal);
2057
8
        ccv_nnc_cmd_exec(cmd, hint, flags, TENSOR_LIST(dest, src), TENSOR_LIST(dest), stream_0.stream);
2058
8
        if (stream_context)
2059
4
        {
2060
4
          ccv_nnc_stream_context_emit_signal(stream_0.stream, stream_0.synced);
2061
4
          ccv_nnc_stream_context_wait_signal(stream_context, stream_0.synced);
2062
4
        }
2063
8
        streams[j] = stream_0.stream;
2064
8
      }
2065
2
      // If this should be blocking, block it.
2066
2
      if (!stream_context)
2067
5
        for (j = 0; j < parallel_count; j++)
2068
4
          if (streams[j])
2069
4
            ccv_nnc_stream_context_wait(streams[j]);
2070
2
    } else {
2071
1
      ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d];
2072
1
      assert(src);
2073
1
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
2074
1
      assert(dest);
2075
1
      ccv_nnc_cmd_exec(cmd, hint, flags, TENSOR_LIST(dest, src), TENSOR_LIST(dest), stream_context);
2076
1
    }
2077
3
    // Mark this symbol as init'ed.
2078
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2079
3
    to_compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
2080
3
  }
2081
3
  ccv_array_free(to_parameter_indices);
2082
3
  ccv_array_free(from_parameter_indices);
2083
3
}
2084
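[Usage sketch, illustrative only: one plausible use of zip_map is keeping an exponential moving average of another model's parameters. The function invokes cmd over the (dest, src) pair and writes back into dest; CMD_ADD_FORWARD is assumed to take the two blend coefficients, and the 0.9/0.1 weights plus the `ema` handle are made up.]

  ccv_cnnp_model_parameters_zip_map(ema, ccv_cnnp_model_parameters(ema, -1, -1),
    CMD_ADD_FORWARD(0.9, 0.1), ccv_nnc_no_hint, 0, 0,
    model, ccv_cnnp_model_parameters(model, -1, -1));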
2085
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_stream_context_t* const stream_context)
2086
14
{
2087
14
  int to_param_ref;
2088
14
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2089
14
  // To models.
2090
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2091
14
  assert(to_compiled_data);
2092
14
  // Tensors have to be init'ed already.
2093
14
  assert(!!to_compiled_data->tensors_init.v);
2094
14
  assert(to_compiled_data->tensors.parameters);
2095
14
  // From models.
2096
14
  const int parallel_count = ccv_max(model->parallel_count, 1);
2097
14
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2098
14
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 1;
2099
14
  int i, j;
2100
28
  for (i = 0; i < rnum; i++)
2101
14
  {
2102
14
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2103
14
    assert(dest_d >= 0);
2104
14
    assert(dest_d < to_compiled_data->parameters->rnum);
2105
14
    if (parallel_count > 1)
2106
4
    {
2107
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2108
4
      ccv_nnc_stream_signal_t* signal;
2109
4
      if (stream_context)
2110
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2111
20
      for (j = 0; j < parallel_count; j++)
2112
16
      {
2113
16
        ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2114
16
        if (!dest)
2115
0
        {
2116
0
          streams[j] = 0;
2117
0
          continue;
2118
0
        }
2119
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2120
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2121
16
        int type = stream_type;
2122
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2123
16
        ccv_nnc_synced_stream_t stream_0 = _ccv_cnnp_compiled_data_get_synced_stream(to_compiled_data, type);
2124
16
        // Wait signal to finish.
2125
16
        if (stream_context)
2126
4
          ccv_nnc_stream_context_wait_signal(stream_0.stream, signal);
2127
16
        ccv_nnc_cmd_exec(cmd, hint, flags, TENSOR_LIST(dest), TENSOR_LIST(dest), 0);
2128
16
        if (stream_context)
2129
4
        {
2130
4
          ccv_nnc_stream_context_emit_signal(stream_0.stream, stream_0.synced);
2131
4
          ccv_nnc_stream_context_wait_signal(stream_context, stream_0.synced);
2132
4
        }
2133
16
        streams[j] = stream_0.stream;
2134
16
      }
2135
4
      // If this should be blocking, block it.
2136
4
      if (!stream_context)
2137
15
        for (j = 0; j < parallel_count; j++)
2138
12
          if (streams[j])
2139
12
            ccv_nnc_stream_context_wait(streams[j]);
2140
10
    } else {
2141
10
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
2142
10
      assert(dest);
2143
10
      ccv_nnc_cmd_exec(cmd, hint, flags, TENSOR_LIST(dest), TENSOR_LIST(dest), stream_context);
2144
10
    }
2145
14
    // No need to mark this symbol as init'ed; it already is.
2146
14
  }
2147
14
  ccv_array_free(to_parameter_indices);
2148
14
}
2149
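[Usage sketch, illustrative only: parameters_map applies an in-place command to every parameter on every device copy. Zeroing all parameters with the same CMD_SET_FORWARD used for the saved aux earlier in this file:]

  ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, -1, -1),
    CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0);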
2150
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
2151
2.20k
{
2152
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2153
2.20k
  assert(compiled_data);
2154
2.20k
  return compiled_data->minimize.minimizer;
2155
2.20k
}
2156
2157
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
2158
4.35k
{
2159
4.35k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2160
4.35k
  assert(compiled_data);
2161
4.35k
  const int parameter_size = compiled_data->parameters->rnum;
2162
4.35k
  if (parameter_size == 0)
2163
1
    return;
2164
4.35k
  if (reset)
2165
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
2166
4.35k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2167
4.35k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
2168
4.35k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
2169
6
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
2170
4.35k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2171
4.35k
  // We update all parameters; at this point, we have one minimizer.
2172
4.35k
  if (set_parameters == 0 || set_parameter_size == 0)
2173
4.05k
    compiled_data->minimize.minimizer = minimizer;
2174
4.35k
  int i;
2175
4.35k
  if (set_parameters && set_parameter_size)
2176
301
  {
2177
301
    // We need to record which minimizer goes with these parameters.
2178
301
    if (!compiled_data->minimize.parameters)
2179
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
2180
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
2181
301
    set_minimizer_for_parameter->minimizer = minimizer;
2182
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
2183
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
2184
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
2185
301
  }
2186
4.35k
  // If reset is true, clear the parameters array.
2187
4.35k
  if (reset && compiled_data->minimize.parameters)
2188
291
  {
2189
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; i++)
2190
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2191
291
    ccv_array_clear(compiled_data->minimize.parameters);
2192
291
  }
2193
4.35k
  if (!compiled_data->update_nodes)
2194
9
    return;
2195
4.34k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
2196
4.34k
  assert(symbolic_graph);
2197
4.34k
  if (saved_aux_size > old_max_saved_aux_size)
2198
6
  {
2199
6
    assert(compiled_data->updated_parameters);
2200
6
    // Reallocate first, move them around later.
2201
6
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
2202
6
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
2203
6
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
2204
6
    // We need to do this from back to front, because with saved_aux_size > old_saved_aux_size the regions could overlap.
2205
6
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
2206
6
  }
2207
4.34k
  int flag = 0;
2208
4.34k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2209
4.34k
  if (set_parameters && set_parameter_size)
2210
296
  {
2211
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2212
592
    for (i = 0; i < set_parameter_size; i++)
2213
296
    {
2214
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? set_parameters[i]->param_sel - 1 : set_parameters[i]->param_sel;
2215
296
      assert(set_parameters[i]->param_sel != 0);
2216
296
      const int old_rnum = parameter_indices->rnum;
2217
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
2218
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? set_parameters[i]->param_ref - 1 : set_parameters[i]->param_ref;
2219
296
      assert(set_parameters[i]->param_ref != 0);
2220
296
      if (param_ref >= 0)
2221
0
      {
2222
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
2223
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
2224
0
        parameter_indices->rnum = old_rnum + 1;
2225
0
      }
2226
296
    }
2227
296
    // We may have duplicated indices, but that is OK; we will just set them twice.
2228
5.24k
    for (i = 0; i < parameter_indices->rnum; i++)
2229
4.95k
    {
2230
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
2231
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
2232
0
        flag = 1;
2233
4.95k
    }
2234
296
    ccv_array_free(parameter_indices);
2235
4.05k
  } else {
2236
19.1k
    for (i = 0; i < parameter_size; i++)
2237
15.0k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
2238
62
        flag = 1;
2239
4.05k
    if (compiled_data->minimize.parameters)
2240
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
2241
0
        flag = 1;
2242
4.05k
  }
2243
4.34k
  if (flag)
2244
6
  {
2245
6
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply gradients graph.
2246
6
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
2247
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
2248
6
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
2249
6
  }
2250
4.34k
}
2251
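[Usage sketch, illustrative only: reset the global minimizer, then override it for one sub-model's parameters. The CMD_SGD_FORWARD hyper-parameter values and their order, as well as the `backbone` handle, are assumptions; consult the command's definition before copying.]

  ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.01, 1, 0.99, 0.9, 0.9), 1, 0, 0);
  const ccv_cnnp_model_io_t backbone_params = ccv_cnnp_model_parameters(backbone, -1, -1);
  ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.001, 1, 0.99, 0.9, 0.9), 0, &backbone_params, 1);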
2252
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
2253
{
2254
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2255
  assert(compiled_data);
2256
  compiled_data->compile_params = compile_params;
2257
}
2258
2259
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
2260
22
{
2261
22
  if (model->graph && out_size > 0)
2262
22
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
2263
22
  if (model->compiled_data && model->compiled_data->graph && out_size > 1)
2264
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
2265
22
  if (model->compiled_data && model->compiled_data->backward.accum && out_size > 2)
2266
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
2267
22
  if (model->compiled_data && model->compiled_data->apply_gradients.graph && out_size > 3)
2268
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
2269
22
}
2270
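[Usage sketch, illustrative only: dump the symbolic graph, and once compiled the concrete graphs, to Graphviz files. CCV_NNC_LONG_DOT_GRAPH is the verbose flag from ccv_nnc.h; writing only the first output here.]

  FILE* outs[] = { fopen("model.dot", "w+") };
  ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, outs, 1);
  fclose(outs[0]);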
2271
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
2272
2.25k
{
2273
2.25k
  int i;
2274
2.25k
  const int parameter_size = compiled_data->parameters->rnum;
2275
2.25k
  ccv_array_free(compiled_data->parameters);
2276
2.25k
  const int internal_size = compiled_data->internals->rnum;
2277
2.25k
  ccv_array_free(compiled_data->internals);
2278
2.25k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
2279
2.25k
  assert(compiled_data->ids.internals->rnum == internal_size);
2280
5.18k
  for (i = 0; i < parameter_size; i++)
2281
2.92k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
2282
2.25k
  ccv_array_free(compiled_data->ids.parameters);
2283
2.41k
  for (i = 0; i < internal_size; i++)
2284
2.25k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
2285
2.25k
  ccv_array_free(compiled_data->ids.internals);
2286
2.25k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2287
2.25k
  if (compiled_data->tensors.parameters)
2288
48
  {
2289
697
    for (i = 0; i < parameter_size * parallel_count; i++)
2290
649
      ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2291
202
    for (i = 0; i < internal_size * parallel_count; i++)
2292
154
      if (compiled_data->tensors.internals[i])
2293
154
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
2294
48
    ccfree(compiled_data->tensors.parameters);
2295
48
  }
2296
2.25k
  if (compiled_data->tensors.gradients)
2297
22
  {
2298
339
    for (i = 0; i < parameter_size * parallel_count; i++)
2299
317
    {
2300
317
      ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
2301
317
      if (compiled_data->tensors.accum_gradients[i])
2302
14
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
2303
317
    }
2304
22
    ccfree(compiled_data->tensors.gradients);
2305
22
  }
2306
2.25k
  if (compiled_data->minimize.parameters)
2307
5
  {
2308
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; i++)
2309
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2310
5
    ccv_array_free(compiled_data->minimize.parameters);
2311
5
  }
2312
2.25k
  if (compiled_data->rewindables)
2313
32
    ccv_array_free(compiled_data->rewindables);
2314
2.25k
  if (compiled_data->tensors_init.v)
2315
2.25k
    ccfree(compiled_data->tensors_init.v);
2316
2.25k
  if (compiled_data->evaluate.tos)
2317
2.25k
    ccfree(compiled_data->evaluate.tos);
2318
2.25k
  compiled_data->evaluate.tos = 0;
2319
2.25k
  if (compiled_data->synced_streams)
2320
4
  {
2321
4
    khiter_t k;
2322
36
    for (k = kh_begin(compiled_data->synced_streams); k != kh_end(compiled_data->synced_streams); ++k)
2323
32
    {
2324
32
      if (!kh_exist(compiled_data->synced_streams, k))
2325
32
        continue;
2326
16
      ccv_nnc_synced_stream_t* const synced_stream = &kh_val(compiled_data->synced_streams, k);
2327
16
      ccv_nnc_stream_context_free(synced_stream->stream);
2328
16
      ccv_nnc_stream_signal_free(synced_stream->synced);
2329
16
    }
2330
4
    kh_destroy(synced_stream, compiled_data->synced_streams);
2331
4
  }
2332
2.25k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
2333
2.25k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
2334
2.25k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
2335
2.25k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
2336
2.25k
  ccfree(compiled_data);
2337
2.25k
}
2338
2339
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
2340
5.30k
{
2341
5.30k
  if (model->isa->deinit)
2342
1.34k
    model->isa->deinit(model);
2343
5.30k
  if (model->io)
2344
707
  {
2345
707
    int i;
2346
1.74k
    for (i = 0; i < model->io->rnum; i++)
2347
1.04k
    {
2348
1.04k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
2349
1.04k
      if (model_io->outgoings)
2350
601
        ccv_array_free(model_io->outgoings);
2351
1.04k
      if (model_io->incomings)
2352
556
        ccv_array_free(model_io->incomings);
2353
1.04k
      ccfree(model_io);
2354
1.04k
    }
2355
707
    ccv_array_free(model->io);
2356
707
  }
2357
5.30k
  if (model->parameter_indices)
2358
2.50k
    ccv_array_free(model->parameter_indices);
2359
5.30k
  if (model->inputs)
2360
5.30k
    ccfree(model->inputs);
2361
5.30k
  if (model->graph)
2362
2.25k
    ccv_nnc_symbolic_graph_free(model->graph);
2363
5.30k
  if (model->compiled_data)
2364
2.25k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
2365
5.30k
  if (model->name)
2366
5.30k
    ccfree(model->name);
2367
5.30k
  ccfree(model);
2368
5.30k
}