Coverage Report

Created: 2022-07-27 23:53

/home/liu/buildslave/linux-x64-runtests/build/lib/nnc/ccv_cnnp_model.c
Line | Count | Source
1
#include "ccv_nnc.h"
2
#include "ccv_nnc_easy.h"
3
#include "ccv_nnc_internal.h"
4
#include "ccv_internal.h"
5
#include "_ccv_cnnp_model.h"
6
7
// MARK - Level-5 API
8
9
ccv_cnnp_model_io_t ccv_cnnp_model_apply(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t* const inputs, const int input_size)
10
542
{
11
542
  assert(input_size > 0);
12
542
  if (!model->io)
13
534
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
14
542
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s) + sizeof(ccv_nnc_tensor_symbol_t) * model->output_size);
15
542
  model_io->param_ref = 0;
16
542
  model_io->param_sel = 0;
17
542
  model_io->visit = 0;
18
542
  model_io->model = model;
19
542
  model_io->incomings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
20
542
  model_io->outgoings = 0;
21
542
  model_io->outputs = (ccv_nnc_tensor_symbol_t*)(model_io + 1);
22
542
  ccv_array_push(model->io, &model_io);
23
542
  int i;
24
542
  ccv_array_resize(model_io->incomings, input_size);
25
542
  memcpy(ccv_array_get(model_io->incomings, 0), inputs, sizeof(ccv_cnnp_model_io_t) * input_size);
26
1.21k
  for (i = 0; i < input_size; i++)
27
670
  {
28
670
    if (!inputs[i]->outgoings)
29
591
      inputs[i]->outgoings = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
30
670
    ccv_array_push(inputs[i]->outgoings, &model_io);
31
670
  }
32
542
  return model_io;
33
542
}
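ccv_cnnp_model_apply above links a model instance to its input IOs and records the edge in both directions: the new IO keeps the inputs in incomings, and each input gains the new IO in outgoings. A minimal usage sketch, using only the signature shown above plus ccv_cnnp_input() and ccv_cnnp_dense() from the same library (the dense arguments are illustrative assumptions, not taken from this report):

  // Sketch: wire one dense layer to an input IO via ccv_cnnp_model_apply.
  ccv_cnnp_model_io_t const x = ccv_cnnp_input();
  ccv_cnnp_model_t* const fc = ccv_cnnp_dense(64, 0, 0); // layer arguments assumed for illustration
  ccv_cnnp_model_io_t const y = ccv_cnnp_model_apply(fc, &x, 1); // y records x in incomings; x records y in outgoings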
34
35
int ccv_cnnp_model_output_size(const ccv_cnnp_model_t* const model)
36
0
{
37
0
  return model->output_size;
38
0
}
39
40
ccv_cnnp_model_io_t ccv_cnnp_model_parameters(ccv_cnnp_model_t* const model, const int selector, const int index)
41
359
{
42
359
  if (!model->io)
43
30
    model->io = ccv_array_new(sizeof(ccv_cnnp_model_io_t), 1, 0);
44
359
  ccv_cnnp_model_io_t model_io = ccmalloc(sizeof(struct ccv_cnnp_model_io_s));
45
359
  model_io->param_ref = index >= 0 ? index + 1 : ALL_PARAMETERS;
46
359
  model_io->param_sel = selector >= 0 ? selector + 1 : ALL_PARAMETERS;
47
359
  model_io->visit = 0;
48
359
  model_io->model = model;
49
359
  model_io->outputs = 0;
50
359
  model_io->incomings = 0;
51
359
  model_io->outgoings = 0;
52
359
  ccv_array_push(model->io, &model_io);
53
359
  return model_io;
54
359
}
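ccv_cnnp_model_parameters stores non-negative selector and index values off by one and maps negative values to ALL_PARAMETERS (lines 45-46 above), so passing -1 for both selects every parameter. A one-line sketch, assuming ALL_PARAMETERS is the public constant from ccv_nnc.h:

  // Sketch: an IO handle covering all parameters of the model.
  ccv_cnnp_model_io_t const all = ccv_cnnp_model_parameters(model, ALL_PARAMETERS, ALL_PARAMETERS);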
55
56
void ccv_cnnp_model_notify_hook(ccv_cnnp_model_t* const model, ccv_cnnp_model_notify_f func, void* const context)
57
3
{
58
3
  model->notify_hook.func = func;
59
3
  model->notify_hook.context = context;
60
3
}
61
62
void ccv_cnnp_model_notify(const ccv_cnnp_model_t* const model, const int tag, void* const payload)
63
14
{
64
14
  if (model->notify_hook.func)
65
3
    model->notify_hook.func(model, tag, payload, model->notify_hook.context);
66
14
  if (model->isa->notify)
67
1
    model->isa->notify(model, tag, payload);
68
14
}
69
70
static int _ccv_nnc_array_dedup_graph_exec_symbols(ccv_nnc_graph_exec_symbol_t* const graph_exec_symbols, int graph_exec_symbol_size)
71
2.23k
{
72
2.23k
  int i, j;
73
4.90k
  for (i = 0; i < graph_exec_symbol_size; i++)
74
2.66k
  {
75
2.66k
    ccv_nnc_graph_exec_symbol_t* const graph_exec_symbol = graph_exec_symbols + i;
76
    // Check whether this tensor symbol has any duplicate.
77
27.4k
    for (j = i + 1; j < graph_exec_symbol_size;)
78
24.7k
    {
79
24.7k
      ccv_nnc_graph_exec_symbol_t* const other_symbol = graph_exec_symbols + j;
80
      // If there is a same tensor symbol, remove it.
81
24.7k
      if (other_symbol->d == graph_exec_symbol->d && other_symbol->graph == graph_exec_symbol->graph)
82
2.75k
      {
83
2.75k
        if (j + 1 < graph_exec_symbol_size)
84
474
          *other_symbol = graph_exec_symbols[graph_exec_symbol_size - 1];
85
2.75k
        --graph_exec_symbol_size;
86
2.75k
        continue;
87
2.75k
      }
88
22.0k
      ++j;
89
22.0k
    }
90
2.66k
  }
91
2.23k
  return graph_exec_symbol_size;
92
2.23k
}
93
94
typedef struct {
95
  ccv_cnnp_model_sequence_t* sequence;
96
  char prefix;
97
  ccv_array_t* symbols;
98
  ccv_array_t* ids;
99
} ccv_cnnp_model_add_to_array_context_t;
100
101
static void _ccv_cnnp_add_to_array(void* const context, const ccv_nnc_tensor_symbol_t symbol)
102
3.12k
{
103
3.12k
  ccv_cnnp_model_add_to_array_context_t* const add_to_array_context = (ccv_cnnp_model_add_to_array_context_t*)context;
104
3.12k
  ccv_cnnp_model_t* const model = add_to_array_context->sequence->model;
105
3.12k
  int i;
106
3.12k
  if (!model->parameter_indices)
107
2.51k
    model->parameter_indices = ccv_array_new(sizeof(int), 0, 0);
108
37.3k
  for (i = 0; i < add_to_array_context->symbols->rnum; i++)
109
34.2k
  {
110
34.2k
    const ccv_nnc_tensor_symbol_t other_symbol = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(add_to_array_context->symbols, i);
111
34.2k
    if (other_symbol.d == symbol.d && other_symbol.graph == symbol.graph)
112
23
    {
113
      // Only add to parameter_indices if it is trainable.
114
23
      if (add_to_array_context->prefix == 't')
115
14
        ccv_array_add_unique_int(model->parameter_indices, i);
116
      // Found it, return, don't add it.
117
23
      return;
118
23
    }
119
34.2k
  }
120
  // Only add to parameter_indices if it is trainable.
121
3.10k
  if (add_to_array_context->prefix == 't')
122
2.94k
    ccv_array_push(model->parameter_indices, &add_to_array_context->symbols->rnum);
123
  // This is a new one, no need to add_unique_int, it is unique.
124
3.10k
  ccv_array_push(add_to_array_context->symbols, &symbol);
125
3.10k
  char id[2048];
126
3.10k
  id[0] = add_to_array_context->prefix;
127
3.10k
  id[1] = '-';
128
3.10k
  int total_len = 2;
129
6.20k
  for (i = 0; i < add_to_array_context->sequence->sequences->rnum; i++)
130
3.10k
  {
131
3.10k
    const ccv_cnnp_model_name_t* const name = (ccv_cnnp_model_name_t*)ccv_array_get(add_to_array_context->sequence->sequences, i);
132
3.10k
    int len;
133
3.10k
    if (name->name && name->name[0] != '\0')
134
72
      len = snprintf(id + total_len, 2048 - total_len, "%s-%d-", name->name, name->sequence);
135
3.02k
    else
136
3.02k
      len = snprintf(id + total_len, 2048 - total_len, "%d-", name->sequence);
137
3.10k
    total_len += len;
138
3.10k
    if (total_len >= 2047)
139
0
      break;
140
3.10k
  }
141
3.10k
  if (total_len < 2047)
142
3.10k
    total_len += snprintf(id + total_len, 2048 - total_len, "%d", add_to_array_context->sequence->it);
143
3.10k
  assert(total_len < 2048);
144
3.10k
  char *heap_id = (char*)ccmalloc(total_len + 1);
145
3.10k
  memcpy(heap_id, id, total_len + 1);
146
3.10k
  ccv_array_push(add_to_array_context->ids, &heap_id);
147
3.10k
  ++add_to_array_context->sequence->it;
148
3.10k
}
149
150
static void _ccv_cnnp_compiled_data_init(ccv_cnnp_compiled_data_t* const compiled_data, const int output_size)
151
2.26k
{
152
2.26k
  compiled_data->f = compiled_data->fits + output_size;
153
2.26k
  compiled_data->xpu_alloc.mp_hdr = -1;
154
2.26k
  compiled_data->xpu_alloc.freed = kh_init(dy_str);
155
2.26k
  compiled_data->xpu_alloc.allocd = kh_init(dy_alloc);
156
2.26k
}
157
158
static void _ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t loss)
159
2.26k
{
160
2.26k
  assert(model->graph);
161
2.26k
  model->inputs = ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * input_size);
162
2.26k
  int i;
163
4.57k
  for (i = 0; i < input_size; i++)
164
2.31k
    model->inputs[i] = ccv_nnc_tensor_symbol_new(model->graph, inputs[i], 0);
165
2.26k
  ccv_array_t* const parameters = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
166
2.26k
  ccv_array_t* const parameter_ids = ccv_array_new(sizeof(char*), 0, 0);
167
2.26k
  ccv_cnnp_model_sequence_t model_sequence = {
168
2.26k
    .bank = kh_init(ccv_cnnp_model_name_bank)
169
2.26k
  };
170
2.26k
  ccv_cnnp_model_add_to_array_context_t add_to_parameter_context = {
171
2.26k
    .sequence = &model_sequence,
172
2.26k
    .prefix = 't',
173
2.26k
    .symbols = parameters,
174
2.26k
    .ids = parameter_ids,
175
2.26k
  };
176
2.26k
  ccv_array_t* const internals = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
177
2.26k
  ccv_array_t* const internal_ids = ccv_array_new(sizeof(char*), 0, 0);
178
2.26k
  ccv_cnnp_model_add_to_array_context_t add_to_output_context = {
179
2.26k
    .sequence = &model_sequence,
180
2.26k
    .prefix = 'r',
181
2.26k
    .symbols = internals,
182
2.26k
    .ids = internal_ids,
183
2.26k
  };
184
2.26k
  ccv_cnnp_model_build_data_t build_data = {
185
2.26k
    .model_sequence = &model_sequence,
186
2.26k
    .add_to_array = _ccv_cnnp_add_to_array,
187
2.26k
    .parameters = parameters,
188
2.26k
    .context = {
189
2.26k
      .add_to_parameter = &add_to_parameter_context,
190
2.26k
      .add_to_output = &add_to_output_context,
191
2.26k
    },
192
2.26k
  };
193
2.26k
  model->data = &build_data;
194
2.26k
  ccv_cnnp_model_build(model, model->graph, model->inputs, input_size, 0, 0);
195
2.26k
  model->data = 0;
196
2.26k
  kh_destroy(ccv_cnnp_model_name_bank, model_sequence.bank);
197
2.26k
  ccv_array_free(model_sequence.sequences);
198
  // Assert no parameter is alias.
199
5.20k
  for (i = 0; i < parameters->rnum; i++)
200
2.94k
  {
201
2.94k
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(parameters, i);
202
2.94k
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(parameter.graph, parameter);
203
2.94k
    assert(alias_to.graph == 0); // Cannot find the one alias to.
204
2.94k
  }
205
  // Assert no internal is alias.
206
2.42k
  for (i = 0; i < internals->rnum; i++)
207
161
  {
208
161
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(internals, i);
209
161
    const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(retained.graph, retained);
210
161
    assert(alias_to.graph == 0); // Cannot find the one alias to.
211
161
  }
212
2.26k
  const int output_size = model->output_size;
213
2.26k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
214
2.26k
  ccv_nnc_symbolic_graph_simplify(model->graph,
215
2.26k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_COMMON_SUBEXPRESSION_ELIMINATION,
216
2.26k
      CCV_NNC_SIMPLIFY_DATA_TRANSFER_OPT,
217
2.26k
      CCV_NNC_SIMPLIFY_OPS_FUSION,
218
2.26k
      CCV_NNC_SIMPLIFY_GRAPH_PRUNING),
219
2.26k
    model->inputs, input_size,
220
2.26k
    model->outputs, output_size,
221
2.26k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
222
2.26k
  ccv_cnnp_compiled_data_t* compiled_data = model->compiled_data = cccalloc(1, sizeof(ccv_cnnp_compiled_data_t) + sizeof(ccv_nnc_tensor_symbol_t) * (output_size * 2 - 1));
223
2.26k
  _ccv_cnnp_compiled_data_init(compiled_data, output_size);
224
2.26k
  const int evaluate_to_size = compiled_data->evaluate.to_size = ccv_nnc_symbolic_graph_destination_size(model->graph);
225
2.26k
  assert(evaluate_to_size > 0);
226
2.26k
  compiled_data->evaluate.tos = ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
227
2.26k
  memcpy(compiled_data->evaluate.tos, ccv_nnc_symbolic_graph_destinations(model->graph), sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size);
228
2.26k
  compiled_data->loss = loss;
229
2.26k
  if (loss.cmd == CCV_NNC_NOOP)
230
2.25k
  {
231
    // If no loss function provided, there is no fits.
232
4.51k
    for (i = 0; i < output_size; i++)
233
2.26k
    {
234
2.26k
      compiled_data->fits[i] = NO_TENSOR_SYMBOL;
235
2.26k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(model->graph, model->outputs[i]);
236
2.26k
      if (alias_to.d < 0)
237
1.26k
        compiled_data->f[i] = model->outputs[i];
238
1.00k
      else { // We cannot differentiate against an alias, therefore, we have to verify this output is full, and we can diff against the original.
239
1.00k
        int ofs[CCV_NNC_MAX_DIM_ALLOC];
240
1.00k
        int inc[CCV_NNC_MAX_DIM_ALLOC];
241
1.00k
        ccv_nnc_tensor_symbol_alias_params(model->graph, model->outputs[i], ofs, inc);
242
1.00k
        int j;
243
13.0k
        for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC; j++)
244
12.0k
          { assert(ofs[j] == 0); } // There is no ofs.
245
1.00k
        compiled_data->f[i] = alias_to; // Unfortunately, I cannot assert the size yet.
246
1.00k
      }
247
2.26k
    }
248
2.25k
  } else {
249
14
    for (i = 0; i < output_size; i++)
250
7
    {
251
7
      const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(model->graph, model->outputs[i]);
252
7
      const ccv_nnc_tensor_symbol_t fit = compiled_data->fits[i] = ccv_nnc_tensor_symbol_new(model->graph, info, 0);
253
7
      compiled_data->f[i] = ccv_nnc_tensor_symbol_new(model->graph, ccv_nnc_tensor_auto, 0);
254
7
      ccv_nnc_graph_exec_symbol_new(model->graph, loss, TENSOR_SYMBOL_LIST(model->outputs[i], fit), TENSOR_SYMBOL_LIST(compiled_data->f[i]), 0);
255
7
    }
256
7
  }
257
2.26k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
258
2.26k
  ccv_nnc_symbolic_graph_simplify(model->graph,
259
2.26k
    SYMBOLIC_GRAPH_PASSES(CCV_NNC_SIMPLIFY_OPS_FUSION), // Only do Ops fusion, in this way, we can fuse the loss function.
260
2.26k
    0, 0, // No need to provide binds at this point.
261
2.26k
    compiled_data->f, model->output_size,
262
2.26k
    SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
263
  // If inputs are from GPU, stream type is GPU.
264
2.26k
  compiled_data->parameters = parameters;
265
2.26k
  compiled_data->internals = internals;
266
2.26k
  compiled_data->ids.parameters = parameter_ids;
267
2.26k
  compiled_data->ids.internals = internal_ids;
268
2.26k
}
269
270
static void _ccv_cnnp_graph_push_graph_exec_symbol(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
271
7.41k
{
272
7.41k
  ccv_array_t* const stack = (ccv_array_t*)context;
273
7.41k
  ccv_array_push(stack, &symbol.d);
274
7.41k
}
275
276
static void _ccv_nnc_tensor_symbol_reinit(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
277
38.1k
{
278
38.1k
  const ccv_nnc_tensor_symbol_t src_symbol = {
279
38.1k
    .d = src_index,
280
38.1k
    .graph = src_graph
281
38.1k
  };
282
38.1k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
283
38.1k
    .d = dest_index,
284
38.1k
    .graph = dest_graph
285
38.1k
  };
286
38.1k
  const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
287
38.1k
  ccv_nnc_tensor_symbol_set(dest_graph, dest_symbol, params);
288
38.1k
  int ofs[CCV_NNC_MAX_DIM_ALLOC];
289
38.1k
  int inc[CCV_NNC_MAX_DIM_ALLOC];
290
38.1k
  if (0 == ccv_nnc_tensor_symbol_alias_params(src_graph, src_symbol, ofs, inc))
291
1.36k
    ccv_nnc_tensor_symbol_alias_set(dest_graph, dest_symbol, ofs, inc);
292
38.1k
}
293
294
static int _ccv_nnc_tensor_symbol_check_dim(const ccv_nnc_symbolic_graph_t* const src_graph, ccv_nnc_symbolic_graph_t* const dest_graph, const int src_index, const int dest_index)
295
2.45k
{
296
2.45k
  const ccv_nnc_tensor_symbol_t src_symbol = {
297
2.45k
    .d = src_index,
298
2.45k
    .graph = src_graph
299
2.45k
  };
300
2.45k
  const ccv_nnc_tensor_param_t src_params = ccv_nnc_tensor_symbol_params(src_graph, src_symbol);
301
2.45k
  const ccv_nnc_tensor_symbol_t dest_symbol = {
302
2.45k
    .d = dest_index,
303
2.45k
    .graph = dest_graph
304
2.45k
  };
305
2.45k
  const ccv_nnc_tensor_param_t dest_params = ccv_nnc_tensor_symbol_params(dest_graph, dest_symbol);
306
2.45k
  return memcmp(src_params.dim, dest_params.dim, sizeof(src_params.dim)) == 0;
307
2.45k
}
308
309
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size);
310
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data);
311
312
typedef struct {
313
  int parallel_count;
314
  ccv_nnc_symbolic_graph_t* graph;
315
  ccv_nnc_graph_exec_arena_t* graph_exec_arena;
316
} ccv_nnc_graph_exec_update_t;
317
318
static void _ccv_cnnp_cmd_update_for_execs(void* const context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint)
319
60
{
320
60
  ccv_nnc_graph_exec_update_t* const graph_exec_update = (ccv_nnc_graph_exec_update_t*)context;
321
60
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = graph_exec_update->graph_exec_arena;
322
60
  ccv_nnc_graph_exec_t graph_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, symbol);
323
60
  ccv_nnc_graph_exec_set(graph_exec.graph, graph_exec, cmd);
324
60
  ccv_nnc_graph_exec_set_hint(graph_exec.graph, graph_exec, hint);
325
60
  const ccv_nnc_symbolic_graph_t* const graph = graph_exec_update->graph;
326
60
  const int parallel_count = graph_exec_update->parallel_count;
327
60
  int i;
328
180
  for (i = 1; i < parallel_count; i++)
329
120
  {
330
120
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, ccv_nnc_graph_exec_symbol_copy(graph, symbol, i));
331
120
    if (!CCV_NO_GRAPH_EXEC(copy))
332
120
    {
333
120
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
334
120
      ccv_nnc_graph_exec_set_hint(copy.graph, copy, hint);
335
120
    }
336
120
  }
337
60
}
338
339
void ccv_cnnp_model_absorb(ccv_cnnp_model_t* const model, ccv_cnnp_model_t* const init, const ccv_nnc_tensor_param_t* const inputs, const int input_size)
340
2.20k
{
341
2.20k
  assert(model->graph);
342
2.20k
  assert(model->compiled_data);
343
2.20k
  assert(!init->graph);
344
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
345
2.20k
  init->graph = ccv_nnc_symbolic_graph_new();
346
2.20k
  ccv_array_t* const stack = ccv_array_new(sizeof(int), 0, 0);
347
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, _ccv_cnnp_graph_push_graph_exec_symbol, stack);
348
2.20k
  _ccv_cnnp_model_compile(init, inputs, input_size, compiled_data->loss);
349
2.20k
  init->parallel_count = model->parallel_count;
350
2.20k
  init->memory_compression = model->memory_compression;
351
2.20k
  init->compiled_data->stream_type = model->compiled_data->stream_type;
352
2.20k
  init->compiled_data->minimize.minimizer = model->compiled_data->minimize.minimizer;
353
2.20k
  init->compiled_data->minimize.max_saved_aux_size = model->compiled_data->minimize.max_saved_aux_size;
354
2.20k
  if (model->compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
355
2.20k
    _ccv_cnnp_model_gradient_init(init, model->compiled_data->gradient_mode, model->compiled_data->disable_outgrad, 0, 0);
356
2.20k
  ccv_nnc_graph_exec_symbol_new_hook(init->graph, 0, 0);
357
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(init->graph, TRAVERSE_FULL);
358
2.20k
  int i, j;
359
  // Verify that parameters, internals and saved_aux in both graphs have the same dimensionality.
360
4.65k
  for (i = 0; i < compiled_data->parameters->rnum; i++)
361
2.45k
  {
362
2.45k
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
363
2.45k
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
364
2.45k
  }
365
2.20k
  for (i = 0; i < compiled_data->internals->rnum; i++)
366
0
  {
367
0
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
368
0
    assert(_ccv_nnc_tensor_symbol_check_dim(model->graph, init->graph, d, d));
369
0
  }
370
  // Update inputs.
371
2.20k
  assert(model->input_size == init->input_size);
372
4.42k
  for (i = 0; i < model->input_size; i++)
373
2.21k
    if (model->inputs[i].d >= 0)
374
2.21k
    {
375
2.21k
      assert(init->inputs[i].d >= 0);
376
2.21k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->inputs[i].d, model->inputs[i].d);
377
2.21k
    }
378
  // Update outputs.
379
2.20k
  assert(model->output_size == init->output_size);
380
4.41k
  for (i = 0; i < model->output_size; i++)
381
2.20k
  {
382
2.20k
    if (model->outputs[i].d >= 0)
383
2.20k
    {
384
2.20k
      assert(init->outputs[i].d >= 0);
385
2.20k
      _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->outputs[i].d, model->outputs[i].d);
386
2.20k
    }
387
2.20k
    if (model->outputs[i].d != model->compiled_data->f[i].d)
388
1.00k
    {
389
1.00k
      assert(init->outputs[i].d != init->compiled_data->f[i].d);
390
1.00k
      if (model->compiled_data->f[i].d >= 0)
391
1.00k
      {
392
1.00k
        assert(init->compiled_data->f[i].d >= 0);
393
1.00k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, init->compiled_data->f[i].d, model->compiled_data->f[i].d);
394
1.00k
      }
395
1.00k
    }
396
2.20k
  }
397
  // Go through the graph to set tensor on matching symbols
398
9.62k
  for (i = 0; i < stack->rnum; i++)
399
7.41k
  {
400
7.41k
    const int d = *(int*)ccv_array_get(stack, i);
401
    // If exceed range, skip.
402
7.41k
    if (d >= ccv_nnc_graph_exec_symbol_count(init->graph) ||
403
7.41k
      d >= ccv_nnc_graph_exec_symbol_count(model->graph))
404
0
      continue;
405
7.41k
    const ccv_nnc_graph_exec_symbol_t src_symbol = {
406
7.41k
      .d = d,
407
7.41k
      .graph = init->graph
408
7.41k
    };
409
7.41k
    const ccv_nnc_graph_exec_symbol_t dest_symbol = {
410
7.41k
      .d = d,
411
7.41k
      .graph = model->graph
412
7.41k
    };
413
7.41k
    const ccv_nnc_cmd_t src_cmd = ccv_nnc_graph_exec_symbol_cmd(init->graph, src_symbol);
414
7.41k
    const ccv_nnc_cmd_t dest_cmd = ccv_nnc_graph_exec_symbol_cmd(model->graph, dest_symbol);
415
    // If the command doesn't match, skip.
416
7.41k
    if (dest_cmd.cmd != src_cmd.cmd && src_cmd.cmd != CCV_NNC_NOOP)
417
0
      continue;
418
    // Now get all the inputs and outputs, if matches, set them.
419
7.41k
    const int* src_inputs;
420
7.41k
    int src_input_size;
421
7.41k
    const int* src_outputs;
422
7.41k
    int src_output_size;
423
7.41k
    ccv_nnc_graph_exec_symbol_io(init->graph, src_symbol, &src_inputs, &src_input_size, &src_outputs, &src_output_size);
424
7.41k
    const int* dest_inputs;
425
7.41k
    int dest_input_size;
426
7.41k
    const int* dest_outputs;
427
7.41k
    int dest_output_size;
428
7.41k
    ccv_nnc_graph_exec_symbol_io(model->graph, dest_symbol, &dest_inputs, &dest_input_size, &dest_outputs, &dest_output_size);
429
    // We may have unmatched input / output size because this is the minimizer and it has
430
    // different saved_aux (for example, when we shrunk with CMD_NOOP).
431
7.41k
    if (src_input_size != dest_input_size)
432
0
      continue;
433
7.41k
    if (src_output_size != dest_output_size)
434
0
      continue;
435
7.41k
    ccv_nnc_graph_exec_symbol_set(model->graph, dest_symbol, src_cmd);
436
    // There may be mismatches of the source tensor symbols and destination tensor symbols. The reason is because
437
    // we may later passed-in the minimizer, therefore, we may allocate tensors for minimizer later in the original
438
    // graph whereas in the newly created graph, it is streamlined (the minimizer exists from the beginning). That
439
    // will make the order of tensor symbols creation different, therefore, exact which tensor is which wrong as
440
    // well. However, set a new minimizer won't change the exec symbol ordering, because we never create new exec
441
    // symbols after gradient init step. Changing a new minimizer just updated that exec symbols setting, it is not
442
    // a new exec symbol.
443
30.2k
    for (j = 0; j < src_input_size; j++)
444
22.8k
      if (src_inputs[j] >= 0)
445
19.9k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_inputs[j], dest_inputs[j]);
446
20.2k
    for (j = 0; j < src_output_size; j++)
447
12.8k
      if (src_outputs[j] >= 0)
448
12.7k
        _ccv_nnc_tensor_symbol_reinit(init->graph, model->graph, src_outputs[j], dest_outputs[j]);
449
7.41k
  }
450
2.20k
  ccv_array_free(stack);
451
  // After this, we get all tensors in the model graph resolved through tensor_auto.
452
2.20k
  ccv_nnc_symbolic_graph_tensor_auto(model->graph, TRAVERSE_FULL);
453
  // Verify symbols we get matches.
454
2.20k
  const int parameter_size = compiled_data->parameters->rnum;
455
4.65k
  for (i = 0; i < parameter_size; i++)
456
2.45k
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->parameters, i))->d); }
457
2.20k
  const int internal_size = compiled_data->internals->rnum;
458
2.20k
  for (i = 0; i < internal_size; i++)
459
0
    { assert(((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d == ((ccv_nnc_tensor_symbol_t*)ccv_array_get(init->compiled_data->internals, i))->d); }
460
  // Go through compiled data.
461
2.20k
  if (compiled_data->tensor_arena)
462
2.20k
  {
463
2.20k
    const int flag = ccv_nnc_tensor_arena_reinit(compiled_data->tensor_arena, model->graph);
464
2.20k
    if (flag == 0 && compiled_data->graph_exec_arena)
465
2.20k
    {
466
2.20k
      ccv_nnc_graph_exec_reinit(compiled_data->graph_exec_arena, compiled_data->graph, model->graph);
467
      // Since we will reinit, if we previously set is_test, we need to set it again.
468
2.20k
      if (compiled_data->is_test)
469
2
      {
470
2
        const int parallel_count = ccv_max(model->parallel_count, 1);
471
2
        ccv_nnc_graph_exec_update_t update = {
472
2
          .parallel_count = parallel_count,
473
2
          .graph = model->graph,
474
2
          .graph_exec_arena = compiled_data->graph_exec_arena,
475
2
        };
476
2
        ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
477
2
      }
478
2.20k
    } else
479
      // Free-up tensor arena & graph exec arena.
480
4
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
481
2.20k
  }
482
  // There are other compiled graphs, for accum and apply gradients.
483
  // However, the main conclusion is, these absorb operations shouldn't impact parameters.
484
  // Thus, it won't impact the shape of gradients (only outgrad). Since for outgrad, we
485
  // don't allocate ourselves, it is not a concern. For normal gradients, the shape cannot
486
  // be changed otherwise parameters' shape will be meaningless. The same goes to internals.
487
  // That is why we don't update these compiled graphs at all this point.
488
  // Free the model, we've already "absorbed" it.
489
2.20k
  ccv_cnnp_model_free(init);
490
2.20k
}
491
492
void ccv_cnnp_model_compile(ccv_cnnp_model_t* const model, const ccv_nnc_tensor_param_t* const inputs, const int input_size, const ccv_nnc_cmd_t minimizer, const ccv_nnc_cmd_t loss)
493
2.25k
{
494
2.25k
  assert(input_size == model->input_size || model->input_size == 0);
495
2.25k
  if (model->input_size == 0)
496
6
    model->input_size = input_size;
497
2.25k
  if (!model->graph) // The graph is not compiled yet.
498
54
  {
499
54
    model->graph = ccv_nnc_symbolic_graph_new();
500
54
    _ccv_cnnp_model_compile(model, inputs, input_size, loss);
501
54
    assert(model->compiled_data);
502
54
    int i, flag = 0;
503
131
    for (i = 0; !flag && i < input_size; i++)
504
77
      flag = (CCV_TENSOR_GET_MEMORY(inputs[i].type) == CCV_TENSOR_GPU_MEMORY);
505
    // If inputs are from GPU, stream type is GPU.
506
54
    model->compiled_data->stream_type = flag ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
507
54
    model->compiled_data->minimize.minimizer = minimizer;
508
54
    model->compiled_data->minimize.max_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
509
2.20k
  } else {
510
    // Now, finally fill in this part. If the graph is already compiled, we make a copy of the model.
511
    // And then absorb the "new model" to the old one.
512
2.20k
    ccv_cnnp_model_t* const init = ccv_cnnp_model_copy(model);
513
2.20k
    ccv_cnnp_model_absorb(model, init, inputs, input_size);
514
    // Reset minimizer.
515
2.20k
    ccv_cnnp_model_set_minimizer(model, minimizer, 1, 0, 0);
516
2.20k
  }
517
2.25k
}
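ccv_cnnp_model_compile above either builds the symbolic graph for the first time or, when a graph already exists, compiles a copy and absorbs it (the branch at line 497). A minimal call sketch, assuming the CPU_TENSOR_NHWC, CMD_SGD_FORWARD and CMD_CATEGORICAL_CROSSENTROPY_FORWARD helpers from ccv_nnc_easy.h; the macro arguments here are illustrative assumptions, not taken from this report:

  // Sketch: compile for one CPU input of shape 128x10; SGD hyperparameters are illustrative.
  const ccv_nnc_tensor_param_t input_params = CPU_TENSOR_NHWC(32F, 128, 10);
  ccv_cnnp_model_compile(model, &input_params, 1,
    CMD_SGD_FORWARD(0, 0.001, 1, 0.99, 0.9, 0.9),
    CMD_CATEGORICAL_CROSSENTROPY_FORWARD());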
518
519
ccv_cnnp_model_t* ccv_cnnp_model_copy(const ccv_cnnp_model_t* const model)
520
2.21k
{
521
2.21k
  return _ccv_cnnp_model_copy(model, 0);
522
2.21k
}
523
524
void ccv_cnnp_model_tensor_auto(ccv_cnnp_model_t* const model, ccv_nnc_tensor_param_t* const outputs, const int output_size)
525
4.44k
{
526
4.44k
  assert(model->graph);
527
4.44k
  assert(output_size == model->output_size);
528
4.44k
  ccv_nnc_symbolic_graph_t* const graph = model->graph;
529
4.44k
  ccv_nnc_symbolic_graph_tensor_auto(graph, TRAVERSE_FULL);
530
4.44k
  int i;
531
8.88k
  for (i = 0; i < output_size; i++)
532
4.44k
  {
533
4.44k
    assert(model->outputs[i].d != CCV_NNC_NO_TENSOR_SYMBOL);
534
4.44k
    outputs[i] = ccv_nnc_tensor_symbol_params(graph, model->outputs[i]);
535
4.44k
  }
536
4.44k
}
537
538
void ccv_cnnp_model_set_workspace_size(ccv_cnnp_model_t* const model, size_t workspace_size)
539
3
{
540
3
  if (workspace_size == model->workspace_size)
541
0
    return;
542
3
  model->workspace_size = workspace_size;
543
3
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
544
3
  if (compiled_data && compiled_data->graph)
545
0
    ccv_nnc_graph_autotune(compiled_data->graph, workspace_size, 0, TRAVERSE_FULL);
546
3
}
547
548
void ccv_cnnp_model_set_data_parallel(ccv_cnnp_model_t* const model, const int parallel)
549
15
{
550
15
  if (parallel == 0)
551
0
    model->parallel_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
552
15
  else
553
15
    model->parallel_count = parallel;
554
15
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
555
15
  if (compiled_data)
556
11
    { assert(!compiled_data->graph); }
557
15
}
558
559
void ccv_cnnp_model_set_memory_compression(ccv_cnnp_model_t* const model, const int memory_compression)
560
0
{
561
0
  model->memory_compression = memory_compression;
562
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
563
0
  if (compiled_data)
564
0
    { assert(!compiled_data->graph); }
565
0
}
566
567
typedef struct {
568
  int parallel_count;
569
  ccv_nnc_symbolic_graph_t* graph;
570
  ccv_cnnp_compiled_data_t* compiled_data;
571
  ccv_nnc_tensor_arena_t* tensor_arena;
572
} ccv_nnc_tensor_init_states_t;
573
574
static int _ccv_cnnp_any_to_init(const ccv_cnnp_compiled_data_t* const compiled_data)
575
61
{
576
61
  int i;
577
161
  for (i = 0; i < compiled_data->parameters->rnum; i++)
578
129
  {
579
129
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i))->d;
580
129
    if (!(compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f))))
581
29
      return 1;
582
129
  }
583
32
  for (i = 0; i < compiled_data->internals->rnum; i++)
584
5
  {
585
5
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i))->d;
586
5
    if (!(compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f))))
587
5
      return 1;
588
5
  }
589
27
  return 0;
590
32
}
591
592
static void _ccv_cnnp_init_states_for_tensors(void* const context, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const input, const ccv_nnc_tensor_symbol_t output_symbol)
593
301
{
594
301
  ccv_nnc_tensor_init_states_t* const tensor_init_states = (ccv_nnc_tensor_init_states_t*)context;
595
301
  ccv_nnc_tensor_arena_t* const tensor_arena = tensor_init_states->tensor_arena;
596
301
  ccv_nnc_tensor_t* const output_tensor = ccv_nnc_tensor_from_symbol(tensor_arena, output_symbol);
597
301
  if (!output_tensor)
598
0
    return;
599
301
  const int d = output_symbol.d;
600
301
  assert(d < tensor_init_states->compiled_data->tensors_init.size);
601
301
  if (tensor_init_states->compiled_data->tensors_init.v[d >> 5] & (1u << (d & 0x1f)))
602
17
    return;
603
284
  tensor_init_states->compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
604
284
  ccv_nnc_cmd_exec(cmd, hint, flags, &input, input ? 1 : 0, &output_tensor, 1, 0);
605
284
  const ccv_nnc_symbolic_graph_t* const graph = tensor_init_states->graph;
606
284
  const int parallel_count = tensor_init_states->parallel_count;
607
284
  int i;
608
764
  for (i = 1; i < parallel_count; i++)
609
480
  {
610
480
    ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(tensor_arena, ccv_nnc_tensor_symbol_copy(graph, output_symbol, i));
611
480
    if (copy)
612
480
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &output_tensor, 1, &copy, 1, 0);
613
480
  }
614
284
}
615
616
// This method can only handle cases we added new tensors and exec, never delete. This invariant is true because
617
// we setup everything (including calling simplify method) in ccv_cnnp_model_compile method, before this rewind setup.
618
static void _ccv_cnnp_model_rewind_graph(ccv_cnnp_model_t* const model)
619
2
{
620
2
  assert(model->graph);
621
2
  assert(model->compiled_data);
622
2
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
623
2
  assert(compiled_data->rewindables);
624
2
  int i;
625
53
  for (i = 0; i < compiled_data->rewindables->rnum; i++)
626
51
  {
627
51
    const ccv_cnnp_rewind_symbol_t* const rewind_symbol = (ccv_cnnp_rewind_symbol_t*)ccv_array_get(compiled_data->rewindables, i);
628
51
    if (rewind_symbol->type == CCV_CNNP_REWIND_GRAPH_EXEC)
629
16
      ccv_nnc_graph_exec_symbol_free(model->graph, rewind_symbol->graph_exec);
630
35
    else if (rewind_symbol->type == CCV_CNNP_REWIND_TENSOR)
631
35
      ccv_nnc_tensor_symbol_free(model->graph, rewind_symbol->tensor);
632
51
  }
633
2
  ccv_array_clear(compiled_data->rewindables);
634
2
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
635
2
}
636
637
638
static void _ccv_cnnp_model_tensor_symbol_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_param_t info, const char* const name)
639
5.98k
{
640
5.98k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
641
5.98k
    .type = CCV_CNNP_REWIND_TENSOR,
642
5.98k
    .tensor = symbol
643
5.98k
  };
644
5.98k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
645
5.98k
  ccv_array_push(rewind_symbols, &rewind_symbol);
646
5.98k
}
647
648
static void _ccv_cnnp_model_tensor_symbol_alias_new_hook(void* context, const ccv_nnc_tensor_symbol_t symbol, const ccv_nnc_tensor_symbol_t from_symbol, const int ofs[CCV_NNC_MAX_DIM_ALLOC], const int inc[CCV_NNC_MAX_DIM_ALLOC], const ccv_nnc_tensor_param_t info, const char* const name)
649
472
{
650
472
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
651
472
    .type = CCV_CNNP_REWIND_TENSOR,
652
472
    .tensor = symbol
653
472
  };
654
472
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
655
472
  ccv_array_push(rewind_symbols, &rewind_symbol);
656
472
}
657
658
static void _ccv_cnnp_model_graph_exec_symbol_new_hook(void* context, const ccv_nnc_graph_exec_symbol_t symbol, const ccv_nnc_cmd_t cmd, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, const ccv_nnc_tensor_symbol_t* const outputs, const int output_size, const char* const name)
659
2.25k
{
660
2.25k
  const ccv_cnnp_rewind_symbol_t rewind_symbol = {
661
2.25k
    .type = CCV_CNNP_REWIND_GRAPH_EXEC,
662
2.25k
    .graph_exec = symbol
663
2.25k
  };
664
2.25k
  ccv_array_t* const rewind_symbols = (ccv_array_t*)context;
665
2.25k
  ccv_array_push(rewind_symbols, &rewind_symbol);
666
2.25k
}
667
668
static void _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(const ccv_nnc_graph_exec_arena_t* const graph_exec_arena, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd, ccv_nnc_symbolic_graph_t* const symbolic_graph)
669
35.1k
{
670
35.1k
  ccv_nnc_graph_exec_t const update_exec = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, exec_symbol);
671
35.1k
  if (!CCV_NO_GRAPH_EXEC(update_exec))
672
19.9k
    ccv_nnc_graph_exec_set(update_exec.graph, update_exec, cmd);
673
35.1k
  int i;
674
50.0k
  for (i = 1; i < parallel_count; i++)
675
14.9k
  {
676
14.9k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
677
14.9k
    const ccv_nnc_graph_exec_t copy = ccv_nnc_graph_exec_from_symbol(graph_exec_arena, copy_symbol);
678
14.9k
    if (!CCV_NO_GRAPH_EXEC(copy))
679
14.7k
      ccv_nnc_graph_exec_set(copy.graph, copy, cmd);
680
14.9k
  }
681
35.1k
}
682
683
static void _ccv_cnnp_model_graph_exec_symbol_set(ccv_nnc_symbolic_graph_t* const symbolic_graph, ccv_cnnp_compiled_data_t* const compiled_data, const int parallel_count, const ccv_nnc_graph_exec_symbol_t exec_symbol, const ccv_nnc_cmd_t cmd)
684
20.1k
{
685
20.1k
  assert(compiled_data);
686
20.1k
  assert(symbolic_graph);
687
20.1k
  ccv_nnc_graph_exec_symbol_set(symbolic_graph, exec_symbol, cmd);
688
20.1k
  int i;
689
35.1k
  for (i = 1; i < parallel_count; i++)
690
15.0k
  {
691
15.0k
    ccv_nnc_graph_exec_symbol_t copy_symbol = ccv_nnc_graph_exec_symbol_copy(symbolic_graph, exec_symbol, i);
692
15.0k
    if (copy_symbol.graph)
693
14.9k
      ccv_nnc_graph_exec_symbol_set(symbolic_graph, copy_symbol, cmd);
694
15.0k
  }
695
20.1k
  ccv_nnc_graph_exec_arena_t* const graph_exec_arena = compiled_data->graph_exec_arena;
696
20.1k
  if (graph_exec_arena)
697
20.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
698
  // Skip backward graph exec arena because it is for a specific accum symbolic graph, not the main graph (model->graph)
699
20.1k
  ccv_nnc_graph_exec_arena_t* const gradient_graph_exec_arena = compiled_data->apply_gradients.graph_exec_arena;
700
20.1k
  if (gradient_graph_exec_arena)
701
15.0k
    _ccv_cnnp_model_graph_symbol_exec_set_for_graph_exec_arena(gradient_graph_exec_arena, parallel_count, exec_symbol, cmd, symbolic_graph);
702
20.1k
}
703
704
static int _ccv_cnnp_set_minimizer_for_parameter(ccv_nnc_symbolic_graph_t* const graph, ccv_cnnp_compiled_data_t* const compiled_data, ccv_nnc_graph_exec_symbol_t* const update_nodes, ccv_nnc_tensor_symbol_t* const updated_parameters, ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parallel_count, const ccv_nnc_cmd_t minimizer, const int saved_aux_size, const int max_saved_aux_size, const int parameter_indice)
705
20.1k
{
706
20.1k
  int this_parameter_flag = 0;
707
20.1k
  const ccv_nnc_cmd_t old_minimizer = ccv_nnc_graph_exec_symbol_cmd(graph, update_nodes[parameter_indice]);
708
20.1k
  int j, k;
709
  // For no-op, we can preserve previous saved_aux_size.
710
20.1k
  if (old_minimizer.cmd != minimizer.cmd && minimizer.cmd != CCV_NNC_NOOP)
711
67
  {
712
    // If the old minimizer is a noop, then the old_saved_aux_size should be whatever its previous
713
    // saved_aux_size is, otherwise we will reinit the saved_aux repeatedly if you switch between
714
    // noop and a minimizer. We don't want that because we do that in high-level frameworks to
715
    // make sure some model parameters don't update if we don't want them to.
716
67
    int old_saved_aux_size;
717
67
    if (old_minimizer.cmd == CCV_NNC_NOOP)
718
67
    {
719
67
      int input_size;
720
67
      ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], 0, &input_size, 0, 0);
721
67
      if (input_size < 2) // This is not legit.
722
0
        old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
723
67
      else // See ccv_nnc_minimizer_saved_aux_size, the saved_aux is inputs excluding gradients and parameters.
724
67
        old_saved_aux_size = input_size - 2;
725
67
    } else
726
0
      old_saved_aux_size = ccv_nnc_minimizer_saved_aux_size(old_minimizer);
727
67
    if (old_saved_aux_size != saved_aux_size)
728
65
    {
729
65
      this_parameter_flag = 1;
730
65
      if (saved_aux_size > old_saved_aux_size)
731
65
      {
732
        // Allocate new tensor symbols.
733
65
        const ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(graph, updated_parameters[parameter_indice]);
734
189
        for (j = old_saved_aux_size; j < saved_aux_size; j++)
735
124
        {
736
124
          saved_aux[parameter_indice * max_saved_aux_size + j].source = ccv_nnc_tensor_symbol_new(graph, info, 0);
737
124
          saved_aux[parameter_indice * max_saved_aux_size + j].destination = ccv_nnc_tensor_symbol_new(graph, info, 0);
738
460
          for (k = 1; k < parallel_count; k++)
739
336
          {
740
336
            ccv_nnc_tensor_param_t dev_info = info;
741
336
            CCV_TENSOR_SET_DEVICE_ID(dev_info.type, k);
742
336
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
743
336
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_new(graph, dev_info, 0);
744
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, src_copy);
745
336
            ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, dest_copy);
746
336
          }
747
124
        }
748
65
      } else {
749
0
        for (j = saved_aux_size; j < old_saved_aux_size; j++)
750
0
        {
751
0
          for (k = 1; k < parallel_count; k++)
752
0
          {
753
0
            const ccv_nnc_tensor_symbol_t src_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
754
0
            if (src_copy.d >= 0)
755
0
            {
756
0
              ccv_nnc_tensor_symbol_free(graph, src_copy);
757
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k, NO_TENSOR_SYMBOL);
758
0
            }
759
0
            const ccv_nnc_tensor_symbol_t dest_copy = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
760
0
            if (dest_copy.d >= 0)
761
0
            {
762
0
              ccv_nnc_tensor_symbol_free(graph, dest_copy);
763
0
              ccv_nnc_tensor_symbol_set_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k, NO_TENSOR_SYMBOL);
764
0
            }
765
0
          }
766
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source);
767
0
          ccv_nnc_tensor_symbol_free(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination);
768
0
          saved_aux[parameter_indice * max_saved_aux_size + j].source = saved_aux[parameter_indice * max_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
769
0
        }
770
0
      }
771
65
    }
772
67
  }
773
20.1k
  _ccv_cnnp_model_graph_exec_symbol_set(graph, compiled_data, parallel_count, update_nodes[parameter_indice], minimizer);
774
20.1k
  if (this_parameter_flag)
775
65
  {
776
65
    ccv_nnc_tensor_symbol_t update_inputs[saved_aux_size + 2];
777
65
    ccv_nnc_tensor_symbol_t update_outputs[saved_aux_size + 1];
778
65
    const int* inputs = 0;
779
65
    int input_size = 0;
780
65
    ccv_nnc_graph_exec_symbol_io(graph, update_nodes[parameter_indice], &inputs, &input_size, 0, 0);
781
65
    assert(input_size >= 1);
782
65
    update_inputs[0].d = inputs[0];
783
65
    update_inputs[0].graph = graph;
784
65
    update_inputs[1].d = inputs[1];
785
65
    update_inputs[1].graph = graph;
786
65
    update_outputs[0] = updated_parameters[parameter_indice];
787
189
    for (j = 0; j < saved_aux_size; j++)
788
124
    {
789
124
      update_inputs[j + 2] = saved_aux[parameter_indice * max_saved_aux_size + j].source;
790
124
      update_outputs[j + 1] = saved_aux[parameter_indice * max_saved_aux_size + j].destination;
791
124
    }
792
65
    ccv_nnc_graph_exec_symbol_set_io(graph, update_nodes[parameter_indice], update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
793
233
    for (k = 1; k < parallel_count; k++)
794
168
    {
795
168
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(graph, update_nodes[parameter_indice], k);
796
168
      assert(copy.d >= 0);
797
168
      ccv_nnc_graph_exec_symbol_io(graph, copy, &inputs, &input_size, 0, 0);
798
168
      assert(input_size >= 1);
799
168
      update_inputs[0].d = inputs[0];
800
168
      update_inputs[0].graph = graph;
801
168
      update_inputs[1].d = inputs[1];
802
168
      update_inputs[1].graph = graph;
803
168
      update_outputs[0] = ccv_nnc_tensor_symbol_copy(graph, updated_parameters[parameter_indice], k);
804
504
      for (j = 0; j < saved_aux_size; j++)
805
336
      {
806
336
        update_inputs[j + 2] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].source, k);
807
336
        update_outputs[j + 1] = ccv_nnc_tensor_symbol_copy(graph, saved_aux[parameter_indice * max_saved_aux_size + j].destination, k);
808
336
      }
809
168
      ccv_nnc_graph_exec_symbol_set_io(graph, copy, update_inputs, saved_aux_size + 2, update_outputs, saved_aux_size + 1);
810
168
    }
811
65
  }
812
20.1k
  return this_parameter_flag;
813
20.1k
}
814
815
typedef struct {
816
  int parameter_size;
817
  ccv_nnc_cmd_t minimizer;
818
  ccv_cnnp_model_io_t parameters[1];
819
} ccv_cnnp_set_minimizer_for_parameter_t;
820
821
static int _ccv_cnnp_apply_parameters_with_minimizer(ccv_cnnp_model_t* const model)
822
296
{
823
296
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
824
296
  assert(compiled_data);
825
296
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
826
  // We update all parameters, at this point, we have one minimizer.
827
296
  const int parameter_size = compiled_data->parameters->rnum;
828
296
  ccv_nnc_graph_exec_symbol_t* const update_nodes = compiled_data->update_nodes;
829
296
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
830
296
  assert(symbolic_graph);
831
296
  const int parallel_count = ccv_max(model->parallel_count, 1);
832
296
  ccv_array_t* const parameters = compiled_data->minimize.parameters;
833
296
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
834
296
  int i, j, flag = 0;
835
301
  for (i = 0; i < parameters->rnum; i++)
836
5
  {
837
5
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = *(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(parameters, i);
838
10
    for (j = 0; j < set_minimizer_for_parameter->parameter_size; j++)
839
5
    {
840
5
      const int param_sel = set_minimizer_for_parameter->parameters[j]->param_sel > 0 ? set_minimizer_for_parameter->parameters[j]->param_sel - 1 : set_minimizer_for_parameter->parameters[j]->param_sel;
841
5
      assert(set_minimizer_for_parameter->parameters[j]->param_sel != 0);
842
5
      const int old_rnum = parameter_indices->rnum;
843
5
      ccv_cnnp_model_add_to_parameter_indices(set_minimizer_for_parameter->parameters[j]->model, param_sel, parameter_indices);
844
5
      const int param_ref = set_minimizer_for_parameter->parameters[j]->param_ref > 0 ? set_minimizer_for_parameter->parameters[j]->param_ref - 1 : set_minimizer_for_parameter->parameters[j]->param_ref;
845
5
      assert(set_minimizer_for_parameter->parameters[j]->param_ref != 0);
846
5
      if (param_ref >= 0)
847
0
      {
848
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
849
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
850
0
        parameter_indices->rnum = old_rnum + 1;
851
0
      }
852
5
    }
853
5
    const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(set_minimizer_for_parameter->minimizer);
854
    // We may have duplicated indices, but that is OK, we will set it twice.
855
58
    for (j = 0; j < parameter_indices->rnum; j++)
856
53
    {
857
53
      const int d = *(int*)ccv_array_get(parameter_indices, j);
858
53
      assert(d <= parameter_size);
859
53
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, set_minimizer_for_parameter->minimizer, saved_aux_size, max_saved_aux_size, d))
860
0
        flag = 1;
861
53
    }
862
5
    ccv_array_clear(parameter_indices);
863
5
  }
864
296
  ccv_array_free(parameter_indices);
865
296
  return flag;
866
296
}
867
868
static void _ccv_cnnp_scatter_saved_aux(ccv_nnc_tensor_symbol_map_t* const saved_aux, const int parameter_size, const int old_saved_aux_size, const int new_saved_aux_size)
869
2.24k
{
870
2.24k
  if (new_saved_aux_size == old_saved_aux_size)
871
2.23k
    return;
872
7
  assert(new_saved_aux_size > old_saved_aux_size);
873
7
  int i, j;
874
72
  for (i = parameter_size - 1; i >= 0; i--)
875
65
  {
876
189
    for (j = new_saved_aux_size - 1; j >= old_saved_aux_size; j--)
877
124
      saved_aux[i * new_saved_aux_size + j].source = saved_aux[i * new_saved_aux_size + j].destination = NO_TENSOR_SYMBOL;
878
65
    for (j = old_saved_aux_size - 1; j >= 0; j--)
879
0
      saved_aux[i * new_saved_aux_size + j] = saved_aux[i * old_saved_aux_size + j];
880
65
  }
881
7
}
882
883
static void _ccv_cnnp_model_set_rewindables(ccv_cnnp_model_t* const model)
884
34
{
885
34
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
886
34
  assert(compiled_data);
887
34
  if (!compiled_data->rewindables)
888
34
    compiled_data->rewindables = ccv_array_new(sizeof(ccv_cnnp_rewind_symbol_t), 0, 0);
889
34
  ccv_nnc_tensor_symbol_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_new_hook, compiled_data->rewindables);
890
34
  ccv_nnc_tensor_symbol_alias_new_hook(model->graph, _ccv_cnnp_model_tensor_symbol_alias_new_hook, compiled_data->rewindables);
891
34
  ccv_nnc_graph_exec_symbol_new_hook(model->graph, _ccv_cnnp_model_graph_exec_symbol_new_hook, compiled_data->rewindables);
892
34
}
893
894
static void _ccv_cnnp_model_gradient_init(ccv_cnnp_model_t* const model, const int gradient_mode, const uint64_t disable_outgrad, ccv_nnc_tensor_t* const* const fits, const int fit_size)
895
2.23k
{
896
2.23k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
897
2.23k
  assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
898
2.23k
  assert(gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_NONE);
899
2.23k
  const int evaluate_to_size = compiled_data->evaluate.to_size;
900
2.23k
  assert(evaluate_to_size > 0);
901
2.23k
  const int parallel_count = ccv_max(model->parallel_count, 1);
902
2.23k
  compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
903
2.23k
  compiled_data->evaluate.to_ops = (ccv_nnc_graph_exec_t*)(compiled_data->evaluate.tos + evaluate_to_size * parallel_count);
904
2.23k
  int i, j;
905
2.23k
  const int output_size = model->output_size;
906
2.23k
  assert(!fits || fit_size == output_size * parallel_count);
907
2.23k
  if (fits)
908
6
    
for (i = 0; 3
i < output_size;
i++3
)
909
3
      ccv_nnc_tensor_symbol_set(model->graph, compiled_data->fits[i], fits[i]->info);
910
2.23k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
911
2.23k
  const int parameter_size = compiled_data->parameters->rnum;
912
2.23k
  compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * max_saved_aux_size * parameter_size);
913
2.23k
  compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
914
2.23k
  compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
915
2.23k
  int parameter_size_maybe_more = parameter_size;
916
2.23k
  compiled_data->disable_outgrad = disable_outgrad;
917
2.23k
  int outgrad_size;
918
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 0)
919
6
    outgrad_size = 0;
920
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
921
2.22k
    outgrad_size = model->input_size;
922
7
  else {
923
7
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
924
7
    outgrad_size = 0;
925
25
    for (i = 0; i < model->input_size; i++)
926
18
      if (!(disable_outgrad & ((uint64_t)1 << i)))
927
7
        ++outgrad_size;
928
7
  }
929
2.23k
  compiled_data->outgrad_size = outgrad_size;
930
2.23k
  parameter_size_maybe_more += outgrad_size;
931
2.23k
  compiled_data->gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size_maybe_more + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size_maybe_more * parallel_count);
932
2.23k
  compiled_data->outgrads = parameter_size_maybe_more > parameter_size ? compiled_data->gradients + parameter_size : 0;
933
2.23k
  compiled_data->backward.tos = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->gradients + parameter_size_maybe_more);
934
2.23k
  compiled_data->backward.to_size = parameter_size_maybe_more;
935
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES || model->input_size == 0)
936
6
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
937
2.23k
  else if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE) // Compute minimize with gradients including inputs.
938
2.22k
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, model->inputs, model->input_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
939
7
  else { // Compute minimize with gradients including selected inputs.
940
7
    assert(model->input_size > 0);
941
7
    assert(disable_outgrad != CCV_CNNP_DISABLE_OUTGRAD_ALL); // If it is disable all, gradient mode won't be this.
942
7
    assert(outgrad_size > 0);
943
7
    ccv_nnc_tensor_symbol_t outgrads[outgrad_size];
944
7
    j = 0;
945
25
    for (i = 0; i < model->input_size; i++)
946
18
      if (!(disable_outgrad & ((uint64_t)1 << i)))
947
7
        outgrads[j++] = model->inputs[i];
948
7
    ccv_nnc_symbolic_graph_minimize(model->graph, compiled_data->minimize.minimizer, compiled_data->f, output_size, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), parameter_size, outgrads, outgrad_size, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), compiled_data->gradients, compiled_data->updated_parameters, compiled_data->saved_aux, compiled_data->update_nodes);
949
7
  }
950
2.23k
  _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer), compiled_data->minimize.max_saved_aux_size);
951
2.23k
  if (compiled_data->minimize.parameters)
952
5
    _ccv_cnnp_apply_parameters_with_minimizer(model);
953
4.47k
  for (i = 0; i < output_size; i++)
954
2.23k
  {
955
2.23k
    const ccv_nnc_tensor_symbol_t df = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
956
    // Init this to 1 so we can backprop.
957
2.23k
    ccv_nnc_tensor_symbol_set_flags(model->graph, df, CCV_NNC_TENSOR_SYMBOL_INIT_ONES);
958
2.23k
  }
959
7.15k
  for (i = 0; i < parameter_size_maybe_more; i++)
960
4.92k
    compiled_data->backward.tos[i] = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->gradients[i]);
961
2.23k
  ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS);
962
2.23k
  ccv_nnc_symbolic_graph_set_destinations(model->graph, compiled_data->update_nodes, parameter_size);
963
4.48k
  for (i = 0; i < parameter_size_maybe_more - parameter_size; i++)
964
2.24k
  {
965
2.24k
    const ccv_nnc_graph_exec_symbol_t outgrad = ccv_nnc_graph_exec_symbol_for_backward(model->graph, compiled_data->outgrads[i]);
966
2.24k
    const int* tos;
967
2.24k
    int to_size;
968
2.24k
    ccv_nnc_graph_exec_symbol_to(model->graph, outgrad, &tos, &to_size);
969
2.24k
    if (to_size == 0) // If this is the end (no minimizers afterwards), we need to attach this as a destination. Otherwise this is covered in update_nodes.
970
5
    {
971
5
      const ccv_nnc_graph_exec_symbol_t* destinations = ccv_nnc_symbolic_graph_destinations(model->graph);
972
5
      int flag = 0;
973
6
      for (j = i - 1; !flag && j >= 0; j--)
974
1
        flag = (destinations[j + parameter_size].d == outgrad.d);
975
5
      if (!flag) // Only if we cannot find it, we add it.
976
4
        ccv_nnc_symbolic_graph_add_destination(model->graph, outgrad);
977
5
    }
978
2.24k
  }
979
2.23k
  if (parallel_count > 1)
980
9
  {
981
9
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
982
9
      0, 0,
983
9
      compiled_data->gradients, parameter_size /* No need to deal with outgrads, we don't allreduce outgrads */,
984
9
      compiled_data->gradients /* We only care about gradients before allreduce, thus, update our current pointers */,
985
9
      0, 0, 0,
986
9
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
987
9
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
988
9
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
989
18
    for (i = 0; i < evaluate_to_size; i++)
990
36
      for (j = 1; j < parallel_count; j++)
991
27
      {
992
27
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
993
27
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
994
27
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
995
27
      }
996
176
    for (i = 0; i < parameter_size_maybe_more; i++)
997
668
      for (j = 1; j < parallel_count; j++)
998
501
      {
999
501
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->backward.tos[i], j);
1000
501
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1001
501
          compiled_data->backward.tos[compiled_data->backward.to_size++] = copy;
1002
501
      }
1003
9
  }
1004
  // Only use memory compression if we are in gradient parameter mode.
1005
2.23k
  if (gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES && model->memory_compression)
1006
0
    ccv_nnc_symbolic_graph_memory_compression(model->graph, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1007
2.23k
  compiled_data->backward.to_size = _ccv_nnc_array_dedup_graph_exec_symbols(compiled_data->backward.tos, compiled_data->backward.to_size);
1008
2.23k
  compiled_data->gradient_mode = gradient_mode;
1009
2.23k
}
1010
1011
void ccv_cnnp_model_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1012
52
{
1013
52
  assert(!compiled_data->tensors.parameters);
1014
52
  const int parameter_size = compiled_data->parameters->rnum;
1015
52
  const int parallel_count = ccv_max(model->parallel_count, 1);
1016
52
  const int internal_size = compiled_data->internals->rnum;
1017
52
  compiled_data->tensors_init.size = ccv_nnc_tensor_symbol_count(model->graph);
1018
52
  compiled_data->tensors_init.v = cccalloc(((compiled_data->tensors_init.size + 31) >> 5), sizeof(uint32_t));
1019
52
  compiled_data->tensors.parameters = (ccv_nnc_tensor_t**)ccmalloc((sizeof(ccv_nnc_tensor_t*) * parameter_size + sizeof(ccv_nnc_tensor_t*) * internal_size) * parallel_count);
1020
52
  compiled_data->tensors.internals = compiled_data->tensors.parameters + parameter_size * parallel_count;
1021
52
  int i, j;
1022
303
  for (i = 0; i < parameter_size; i++)
1023
251
  {
1024
251
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1025
251
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1026
251
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1027
251
    compiled_data->tensors.parameters[i] = ccv_nnc_tensor_new(0, info, 0);
1028
653
    for (j = 1; j < parallel_count; j++)
1029
402
    {
1030
402
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1031
402
      compiled_data->tensors.parameters[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1032
402
    }
1033
251
  }
1034
111
  for (i = 0; i < internal_size; i++)
1035
59
  {
1036
59
    const ccv_nnc_tensor_symbol_t retained = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, i);
1037
59
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(retained.graph, retained);
1038
59
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1039
59
    compiled_data->tensors.internals[i] = ccv_nnc_tensor_new(0, info, 0);
1040
155
    for (j = 1; j < parallel_count; j++)
1041
96
    {
1042
96
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1043
96
      compiled_data->tensors.internals[i + j * internal_size] = ccv_nnc_tensor_new(0, info, 0);
1044
96
    }
1045
59
  }
1046
52
}
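Note: the layout established above (primary copy of parameter i at index i, its device-j replica at i + j * parameter_size, and likewise for internals) is relied on by the rest of this file. A minimal indexing sketch, assuming `compiled_data`, `parameter_size`, `i`, and `j` are in scope exactly as in ccv_cnnp_model_tensors_init above (these are internal fields, not a public API):

  ccv_nnc_tensor_t* const primary = compiled_data->tensors.parameters[i];                      // device 0 copy
  ccv_nnc_tensor_t* const replica = compiled_data->tensors.parameters[i + j * parameter_size]; // device j copy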
1047
1048
static void _ccv_cnnp_model_copy_tensors(const uint32_t* const tensors_init, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1049
7
{
1050
7
  assert(parallel_count > 0);
1051
7
  int i, j;
1052
41
  for (i = 0; i < tensor_size; i++)
1053
34
  {
1054
34
    if (!tensors[i])
1055
0
      continue;
1056
34
    const int d = tensor_symbols[i].d;
1057
34
    if (!(tensors_init[d >> 5] & (1u << (d & 0x1f))))
1058
0
      continue;
1059
136
    for (j = 1; j < parallel_count; j++)
1060
102
      if (tensors[i + j * tensor_size])
1061
102
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, &tensors[i], 1, &tensors[i + j * tensor_size], 1, 0);
1062
34
  }
1063
7
}
1064
1065
static void _ccv_cnnp_model_remove_nocopies(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t** const tensors, const int tensor_size, const int parallel_count)
1066
61
{
1067
61
  assert(parallel_count > 0);
1068
61
  int i, j;
1069
120
  for (i = 0; i < tensor_size; i++)
1070
59
  {
1071
59
    const ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1072
155
    for (j = 1; j < parallel_count; j++)
1073
96
    {
1074
96
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1075
96
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1076
96
      if (copy_tensor && copy.d == CCV_NNC_NO_TENSOR_SYMBOL)
1077
0
      { // We shouldn't allocate this, free it up.
1078
0
        ccv_nnc_tensor_free(tensors[i + j * tensor_size]);
1079
0
        tensors[i + j * tensor_size] = 0;
1080
0
      }
1081
96
    }
1082
59
  }
1083
61
}
1084
1085
static void _ccv_cnnp_model_bind_tensors(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count, ccv_array_t* const tensor_binds)
1086
359
{
1087
359
  assert(parallel_count > 0);
1088
359
  int i, j;
1089
1.59k
  for (i = 0; i < tensor_size; i++)
1090
1.23k
  {
1091
1.23k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1092
1.23k
    if (graph)
1093
1.23k
    {
1094
1.23k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1095
1.23k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1096
1
        tensor_symbol = alias_to;
1097
1.23k
    }
1098
1.23k
    ccv_nnc_tensor_t* const tensor = tensors[i];
1099
1.23k
    if (tensor && tensor_symbol.d != CCV_NNC_NO_TENSOR_SYMBOL)
1100
1.23k
    {
1101
1.23k
      const ccv_nnc_tensor_bind_t retained_bind = {
1102
1.23k
        .symbol = tensor_symbol,
1103
1.23k
        .tensor = tensor
1104
1.23k
      };
1105
1.23k
      ccv_array_push(tensor_binds, &retained_bind);
1106
1.23k
    }
1107
2.87k
    for (j = 1; j < parallel_count; j++)
1108
1.64k
    {
1109
1.64k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1110
1.64k
      ccv_nnc_tensor_t* copy_tensor = tensors[i + j * tensor_size];
1111
1.64k
      if (copy_tensor && copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1112
1.64k
      {
1113
1.64k
        const ccv_nnc_tensor_bind_t bind = {
1114
1.64k
          .symbol = copy,
1115
1.64k
          .tensor = tensors[i + j * tensor_size]
1116
1.64k
        };
1117
1.64k
        ccv_array_push(tensor_binds, &bind);
1118
1.64k
      }
1119
1.64k
    }
1120
1.23k
  }
1121
359
}
1122
1123
static void _ccv_cnnp_compiled_data_graph_free(ccv_cnnp_compiled_data_t* const compiled_data)
1124
2.32k
{
1125
2.32k
  if (compiled_data->graph)
1126
61
    ccv_nnc_graph_free(compiled_data->graph);
1127
2.32k
  compiled_data->graph = 0;
1128
2.32k
  compiled_data->is_test = 0;
1129
2.32k
  if (compiled_data->tensor_arena)
1130
61
    ccv_nnc_tensor_arena_free(compiled_data->tensor_arena);
1131
2.32k
  compiled_data->tensor_arena = 0;
1132
2.32k
  if (compiled_data->graph_exec_arena)
1133
61
    ccv_nnc_graph_exec_arena_free(compiled_data->graph_exec_arena);
1134
2.32k
  compiled_data->graph_exec_arena = 0;
1135
2.32k
  if (compiled_data->backward.from_ops)
1136
27
    ccfree(compiled_data->backward.from_ops);
1137
2.32k
  compiled_data->backward.from_ops = 0;
1138
2.32k
  if (compiled_data->evaluate.schedule)
1139
32
    ccv_nnc_graph_static_schedule_free(compiled_data->evaluate.schedule);
1140
2.32k
  compiled_data->evaluate.schedule = 0;
1141
2.32k
  if (compiled_data->backward.schedule)
1142
24
    ccv_nnc_graph_static_schedule_free(compiled_data->backward.schedule);
1143
2.32k
  compiled_data->backward.schedule = 0;
1144
2.32k
}
1145
1146
static void _ccv_cnnp_compiled_data_gradient_free(ccv_cnnp_compiled_data_t* const compiled_data)
1147
2.26k
{
1148
2.26k
  if (compiled_data->gradients)
1149
2.23k
    ccfree(compiled_data->gradients);
1150
2.26k
  compiled_data->gradients = 0;
1151
2.26k
  if (compiled_data->updated_parameters)
1152
2.23k
    ccfree(compiled_data->updated_parameters);
1153
2.26k
  compiled_data->updated_parameters = 0;
1154
2.26k
  compiled_data->update_nodes = 0;
1155
2.26k
  compiled_data->saved_aux = 0;
1156
2.26k
}
1157
1158
static void _ccv_cnnp_compiled_data_backward_free(ccv_cnnp_compiled_data_t* const compiled_data)
1159
2.29k
{
1160
2.29k
  if (compiled_data->backward.gradients)
1161
5
    ccfree(compiled_data->backward.gradients);
1162
2.29k
  compiled_data->backward.gradients = 0;
1163
2.29k
  if (compiled_data->backward.accum)
1164
5
    ccv_nnc_graph_free(compiled_data->backward.accum);
1165
2.29k
  compiled_data->backward.accum = 0;
1166
2.29k
  if (compiled_data->backward.tensor_arena)
1167
5
    ccv_nnc_tensor_arena_free(compiled_data->backward.tensor_arena);
1168
2.29k
  compiled_data->backward.tensor_arena = 0;
1169
2.29k
  if (compiled_data->backward.graph_exec_arena)
1170
5
    ccv_nnc_graph_exec_arena_free(compiled_data->backward.graph_exec_arena);
1171
2.29k
  compiled_data->backward.graph_exec_arena = 0;
1172
2.29k
}
1173
1174
static void _ccv_cnnp_compiled_data_apply_gradients_free(ccv_cnnp_compiled_data_t* const compiled_data)
1175
2.27k
{
1176
2.27k
  if (compiled_data->apply_gradients.graph)
1177
21
    ccv_nnc_graph_free(compiled_data->apply_gradients.graph);
1178
2.27k
  compiled_data->apply_gradients.graph = 0;
1179
2.27k
  if (compiled_data->apply_gradients.tensor_arena)
1180
21
    ccv_nnc_tensor_arena_free(compiled_data->apply_gradients.tensor_arena);
1181
2.27k
  compiled_data->apply_gradients.tensor_arena = 0;
1182
2.27k
  if (compiled_data->apply_gradients.graph_exec_arena)
1183
21
    ccv_nnc_graph_exec_arena_free(compiled_data->apply_gradients.graph_exec_arena);
1184
2.27k
  compiled_data->apply_gradients.graph_exec_arena = 0;
1185
2.27k
}
1186
1187
// Compile the graph to run ccv_cnnp_model_fit
1188
static void _ccv_cnnp_model_fit_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1189
5
{
1190
5
  int i, j;
1191
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1192
5
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE);
1193
5
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_FIT_MODE;
1194
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1195
5
  assert(output_size == model->output_size * parallel_count);
1196
5
  assert(!fits || output_size == fit_size);
1197
5
  assert(output_size > 0);
1198
5
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1199
5
  {
1200
5
    _ccv_cnnp_model_set_rewindables(model);
1201
5
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1202
5
  } else if (compiled_data->gradient_mode != CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES) {
1203
0
    _ccv_cnnp_model_rewind_graph(model);
1204
0
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1205
0
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1206
0
    _ccv_cnnp_model_gradient_init(model, CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES, CCV_CNNP_DISABLE_OUTGRAD_ALL, fits, fit_size);
1207
0
  }
1208
5
  const int tensors_init = !!compiled_data->tensors_init.v;
1209
5
  if (!tensors_init)
1210
4
    ccv_cnnp_model_tensors_init(model, compiled_data);
1211
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1212
5
  assert((input_size % parallel_count) == 0);
1213
5
  assert((output_size % parallel_count) == 0);
1214
5
  assert((fit_size % parallel_count) == 0);
1215
5
  const int input_size_per_p = input_size / parallel_count;
1216
5
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1217
5
  const int output_size_per_p = output_size / parallel_count;
1218
5
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1219
5
  const int fit_size_per_p = fit_size / parallel_count;
1220
5
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count, tensor_binds);
1221
5
  const int parameter_size = compiled_data->parameters->rnum;
1222
5
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1223
5
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1224
5
  const int internal_size = compiled_data->internals->rnum;
1225
5
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1226
5
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1227
5
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph), &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1228
5
  ccv_array_free(tensor_binds);
1229
5
  if (tensors_init && parallel_count > 1)
1230
0
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1231
  // If tensor is not init'ed, we need to init states first.
1232
5
  if (_ccv_cnnp_any_to_init(compiled_data))
1233
4
  {
1234
4
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1235
4
      .parallel_count = parallel_count,
1236
4
      .graph = model->graph,
1237
4
      .compiled_data = compiled_data,
1238
4
      .tensor_arena = compiled_data->tensor_arena
1239
4
    };
1240
4
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1241
4
  }
1242
5
  compiled_data->is_test = 0;
1243
5
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(compiled_data->minimize.minimizer);
1244
  // No need to set because it defaults to training mode.
1245
  // ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1246
87
  for (i = 0; i < saved_aux_size * parameter_size; i++)
1247
82
  {
1248
82
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, compiled_data->saved_aux[i].source);
1249
82
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1250
286
    for (j = 1; j < parallel_count; j++)
1251
204
    {
1252
204
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1253
204
      if (copy)
1254
204
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1255
204
    }
1256
82
  }
1257
5
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1258
5
  compiled_data->evaluate.to_op_size = 0;
1259
16
  for (i = 0; i < evaluate_to_size; i++)
1260
11
  {
1261
11
    ccv_nnc_graph_exec_t const to = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1262
11
    if (to.graph)
1263
11
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to;
1264
11
  }
1265
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1266
5
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1267
5
}
1268
1269
ccv_nnc_stream_context_t* ccv_cnnp_model_default_stream(const ccv_cnnp_model_t* const model)
1270
0
{
1271
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1272
0
  if (!compiled_data || !compiled_data->graph)
1273
0
    return 0;
1274
0
  return ccv_nnc_graph_default_stream(compiled_data->graph);
1275
0
}
1276
1277
uint64_t ccv_cnnp_model_memory_size(const ccv_cnnp_model_t* const model)
1278
0
{
1279
0
  const ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1280
0
  if (!compiled_data || !compiled_data->tensor_arena)
1281
0
    return 0;
1282
0
  return ccv_nnc_tensor_arena_size(compiled_data->tensor_arena);
1283
0
}
1284
1285
static void _ccv_cnnp_bind_tensors_to_arena(ccv_nnc_tensor_arena_t* const tensor_arena, const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const tensor_symbols, ccv_nnc_tensor_t* const* const tensors, const int tensor_size, const int parallel_count)
1286
38.5k
{
1287
38.5k
  int i, j;
1288
113k
  for (i = 0; i < tensor_size; i++)
1289
75.1k
  {
1290
75.1k
    ccv_nnc_tensor_symbol_t tensor_symbol = tensor_symbols[i];
1291
75.1k
    if (graph)
1292
72.2k
    {
1293
72.2k
      const ccv_nnc_tensor_symbol_t alias_to = ccv_nnc_tensor_symbol_alias_to(graph, tensor_symbol);
1294
72.2k
      if (alias_to.d != CCV_NNC_NO_TENSOR_SYMBOL)
1295
1.00k
        tensor_symbol = alias_to;
1296
72.2k
    }
1297
75.1k
    ccv_nnc_tensor_bind_symbol(tensor_arena, tensor_symbol, tensors[i]);
1298
76.9k
    for (j = 1; j < parallel_count; j++)
1299
1.76k
    {
1300
1.76k
      const ccv_nnc_tensor_symbol_t copy = ccv_nnc_tensor_symbol_copy(graph, tensor_symbol, j);
1301
1.76k
      if (copy.d != CCV_NNC_NO_TENSOR_SYMBOL)
1302
1.76k
        ccv_nnc_tensor_bind_symbol(tensor_arena, copy, tensors[i + tensor_size * j]);
1303
1.76k
    }
1304
75.1k
  }
1305
38.5k
}
1306
1307
void ccv_cnnp_model_fit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const fits, const int fit_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1308
2.41k
{
1309
2.41k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1310
2.41k
  assert(compiled_data);
1311
2.41k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1312
2.41k
  assert(output_size == model->output_size * parallel_count);
1313
2.41k
  assert(input_size == model->input_size * parallel_count);
1314
2.41k
  assert(!fits || fit_size == output_size);
1315
2.41k
  assert(model->graph);
1316
2.41k
  if (!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_FIT_MODE)
1317
5
  {
1318
5
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1319
5
    _ccv_cnnp_compiled_data_backward_free(compiled_data);
1320
5
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
1321
    // Compile the symbolic graph down only when needed.
1322
5
    _ccv_cnnp_model_fit_jit(model, inputs, input_size, fits, fit_size, outputs, output_size);
1323
2.41k
  } else {
1324
2.41k
    assert((input_size % parallel_count) == 0);
1325
2.41k
    assert((output_size % parallel_count) == 0);
1326
2.41k
    assert((fit_size % parallel_count) == 0);
1327
2.41k
    const int input_size_per_p = input_size / parallel_count;
1328
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1329
2.41k
    const int output_size_per_p = output_size / parallel_count;
1330
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1331
2.41k
    const int fit_size_per_p = fit_size / parallel_count;
1332
2.41k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->fits, fits, fit_size_per_p, parallel_count);
1333
2.41k
  }
1334
2.41k
  if (compiled_data->is_test)
1335
0
  {
1336
0
    compiled_data->is_test = 0;
1337
0
    ccv_nnc_graph_exec_update_t update = {
1338
0
      .parallel_count = parallel_count,
1339
0
      .graph = model->graph,
1340
0
      .graph_exec_arena = compiled_data->graph_exec_arena,
1341
0
    };
1342
0
    ccv_cnnp_model_set_is_test(model, 0, _ccv_cnnp_cmd_update_for_execs, &update);
1343
0
  }
1344
2.41k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1345
2.41k
}
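For orientation, a minimal caller-side sketch of the fit path above, assuming a single-input, single-output, non-parallel model that has already been compiled with a loss and minimizer, and pre-allocated `input`, `fit`, and `output` tensors (all hypothetical names; tensor_tape and stream_context are assumed to accept 0, matching the 0s passed through elsewhere in this file):

  ccv_nnc_tensor_t* inputs[] = { input };
  ccv_nnc_tensor_t* fits[] = { fit };
  ccv_nnc_tensor_t* outputs[] = { output };
  int epoch;
  for (epoch = 0; epoch < 10; epoch++)
    ccv_cnnp_model_fit(model, inputs, 1, fits, 1, outputs, 1, 0, 0); // jits on the first call, reuses the compiled graph afterwards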
1346
1347
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = false (MULTISTAGE_MODE_NO_GRAD).
1348
static void _ccv_cnnp_model_multistage_no_grad_jit(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1349
29
{
1350
29
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1351
29
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD;
1352
29
  const int parallel_count = ccv_max(model->parallel_count, 1);
1353
29
  assert(output_size == model->output_size * parallel_count);
1354
29
  assert(output_size > 0);
1355
  // If the gradient is not initialized, continue to set up the parallel process. We don't init the gradient here; rather,
1356
  // we set up proper rewindables so the graph can be rewound to its previous state before we run data parallel.
1357
29
  if (parallel_count > 1 && compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1358
6
  {
1359
6
    const int evaluate_to_size = compiled_data->evaluate.to_size;
1360
6
    compiled_data->evaluate.tos = ccrealloc(compiled_data->evaluate.tos, sizeof(ccv_nnc_graph_exec_symbol_t) * evaluate_to_size * parallel_count + sizeof(ccv_nnc_graph_exec_t) * evaluate_to_size * parallel_count);
1361
6
    _ccv_cnnp_model_set_rewindables(model);
1362
6
    ccv_nnc_symbolic_graph_data_parallel(model->graph, parallel_count,
1363
6
      0, 0,
1364
6
      0, 0, 0,
1365
6
      0, 0, 0,
1366
6
      CCV_NNC_PARALLEL_REDUCE_OP_SUM,
1367
6
      SYMBOLIC_GRAPH_SOURCES(model->graph), SYMBOLIC_GRAPH_DESTINATIONS(model->graph));
1368
6
    ccv_nnc_graph_exec_symbol_autogen(model->graph, 0, 0, CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1369
6
    int i, j;
1370
12
    for (i = 0; i < evaluate_to_size; i++)
1371
24
      for (j = 1; j < parallel_count; j++)
1372
18
      {
1373
18
        const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->evaluate.tos[i], j);
1374
18
        if (copy.d != CCV_NNC_NO_GRAPH_EXEC_SYMBOL)
1375
18
          compiled_data->evaluate.tos[compiled_data->evaluate.to_size++] = copy;
1376
18
      }
1377
6
  }
1378
29
  const int tensors_init = !!compiled_data->tensors_init.v;
1379
29
  if (!tensors_init)
1380
16
    ccv_cnnp_model_tensors_init(model, compiled_data);
1381
29
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1382
29
  assert((input_size % parallel_count) == 0);
1383
29
  assert((output_size % parallel_count) == 0);
1384
29
  const int input_size_per_p = input_size / parallel_count;
1385
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1386
29
  const int output_size_per_p = output_size / parallel_count;
1387
29
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1388
29
  const int parameter_size = compiled_data->parameters->rnum;
1389
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1390
29
  const int internal_size = compiled_data->internals->rnum;
1391
29
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1392
29
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1393
  // If we generated gradients for the graph, only compile part of the graph because the rest is irrelevant for evaluation.
1394
29
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->evaluate.tos, compiled_data->evaluate.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1395
29
  ccv_array_free(tensor_binds);
1396
  // If tensor is not init'ed, we need to init states first.
1397
29
  if (tensors_init && parallel_count > 1)
1398
7
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1399
29
  if (_ccv_cnnp_any_to_init(compiled_data))
1400
13
  {
1401
13
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1402
13
      .parallel_count = parallel_count,
1403
13
      .graph = model->graph,
1404
13
      .compiled_data = compiled_data,
1405
13
      .tensor_arena = compiled_data->tensor_arena
1406
13
    };
1407
13
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1408
13
  }
1409
29
  compiled_data->is_test = 1;
1410
29
  ccv_nnc_graph_exec_update_t update = {
1411
29
    .parallel_count = parallel_count,
1412
29
    .graph = model->graph,
1413
29
    .graph_exec_arena = compiled_data->graph_exec_arena,
1414
29
  };
1415
29
  ccv_cnnp_model_set_is_test(model, 1, _ccv_cnnp_cmd_update_for_execs, &update);
1416
29
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1417
29
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1418
29
}
1419
1420
static void _ccv_cnnp_model_gradient_tensors_init(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
1421
24
{
1422
24
  assert(!compiled_data->tensors.gradients);
1423
24
  const int parameter_size = compiled_data->parameters->rnum;
1424
24
  const int parallel_count = ccv_max(model->parallel_count, 1);
1425
24
  compiled_data->tensors.gradients = (ccv_nnc_tensor_t**)ccmalloc(sizeof(ccv_nnc_tensor_t*) * parameter_size * 2 * parallel_count);
1426
24
  compiled_data->tensors.accum_gradients = compiled_data->tensors.gradients + parameter_size * parallel_count;
1427
24
  int i, j;
1428
165
  for (i = 0; i < parameter_size; i++)
1429
141
  {
1430
141
    const ccv_nnc_tensor_symbol_t parameter = *(ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, i);
1431
141
    ccv_nnc_tensor_param_t info = ccv_nnc_tensor_symbol_params(parameter.graph, parameter);
1432
141
    CCV_TENSOR_SET_DEVICE_ID(info.type, 0);
1433
141
    compiled_data->tensors.gradients[i] = ccv_nnc_tensor_new(0, info, 0);
1434
141
    compiled_data->tensors.accum_gradients[i] = 0; // delay the accumulated gradient allocation until when we need it.
1435
321
    for (j = 1; j < parallel_count; j++)
1436
180
    {
1437
180
      CCV_TENSOR_SET_DEVICE_ID(info.type, j);
1438
180
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1439
180
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = 0;
1440
180
    }
1441
141
  }
1442
24
}
1443
1444
static int _ccv_cnnp_is_disable_outgrad_all(const uint64_t disable_outgrad, const int input_size)
1445
7.96k
{
1446
7.96k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_ALL)
1447
11
    return 1;
1448
7.95k
  if (disable_outgrad == CCV_CNNP_DISABLE_OUTGRAD_NONE)
1449
7.94k
    return 0;
1450
8
  int i;
1451
8
  for (i = 0; i < input_size; i++)
1452
8
    if (!(disable_outgrad & ((uint64_t)1 << i)))
1453
8
      return 0;
1454
0
  return 1;
1455
8
}
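As the check above shows, disable_outgrad is either one of the two sentinels or a per-input bitmask in which a set bit i means no outgrad is computed for input i. A small sketch of building such a mask (assuming the chosen bit pattern does not collide with the sentinel values):

  uint64_t disable_outgrad = (uint64_t)1 << 0; // skip the outgrad for input 0, keep outgrads for the other inputs
  // disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_NONE; // keep every outgrad
  // disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL;  // compute none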
1456
1457
// Compile the graph to run ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1458
// Particularly, this method compiles the evaluation and backprop graph (the main graph).
1459
static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, const uint64_t disable_outgrad, const int is_test, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
1460
27
{
1461
27
  int i, j;
1462
27
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1463
27
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS;
1464
27
  assert(!compiled_data->graph || compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode);
1465
27
  compiled_data->graph_mode = CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE;
1466
27
  const int parallel_count = ccv_max(model->parallel_count, 1);
1467
27
  assert(output_size == model->output_size * parallel_count);
1468
27
  assert(output_size > 0);
1469
  // There shouldn't be a loss function if we evaluate with multistage jit.
1470
27
  assert(compiled_data->loss.cmd == CCV_NNC_NOOP);
1471
27
  if (compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_NONE)
1472
23
  {
1473
23
    _ccv_cnnp_model_set_rewindables(model);
1474
23
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1475
23
  } else if (compiled_data->gradient_mode != target_gradient_mode) {
1476
2
    _ccv_cnnp_model_rewind_graph(model);
1477
2
    _ccv_cnnp_compiled_data_gradient_free(compiled_data);
1478
2
    compiled_data->gradient_mode = CCV_CNNP_COMPILED_DATA_GRADIENT_NONE;
1479
2
    _ccv_cnnp_model_gradient_init(model, target_gradient_mode, disable_outgrad, 0, 0); // The type of outputs and fits should be the same. We only use type here.
1480
2
  }
1481
27
  const int tensors_init = !!compiled_data->tensors_init.v;
1482
27
  if (!tensors_init)
1483
17
    ccv_cnnp_model_tensors_init(model, compiled_data);
1484
27
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1485
27
  assert((input_size % parallel_count) == 0);
1486
27
  assert((output_size % parallel_count) == 0);
1487
27
  const int input_size_per_p = input_size / parallel_count;
1488
27
  _ccv_cnnp_model_bind_tensors(model->graph, model->inputs, inputs, input_size_per_p, parallel_count, tensor_binds);
1489
27
  const int output_size_per_p = output_size / parallel_count;
1490
27
  _ccv_cnnp_model_bind_tensors(model->graph, model->outputs, outputs, output_size_per_p, parallel_count, tensor_binds);
1491
27
  const int parameter_size = compiled_data->parameters->rnum;
1492
27
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1493
27
  const int internal_size = compiled_data->internals->rnum;
1494
27
  _ccv_cnnp_model_remove_nocopies(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count);
1495
27
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->internals, 0), compiled_data->tensors.internals, internal_size, parallel_count, tensor_binds);
1496
27
  if (!compiled_data->tensors.gradients)
1497
24
    _ccv_cnnp_model_gradient_tensors_init(model, compiled_data);
1498
27
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1499
27
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(model->graph), compiled_data->backward.tos, compiled_data->backward.to_size, &compiled_data->graph, &compiled_data->tensor_arena, &compiled_data->graph_exec_arena);
1500
27
  ccv_array_free(tensor_binds);
1501
27
  if (tensors_init && parallel_count > 1)
1502
0
    _ccv_cnnp_model_copy_tensors(compiled_data->tensors_init.v, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, compiled_data->parameters->rnum, parallel_count);
1503
  // If tensor is not init'ed, we need to init states first.
1504
27
  if (_ccv_cnnp_any_to_init(compiled_data))
1505
17
  {
1506
17
    ccv_nnc_tensor_init_states_t tensor_init_states = {
1507
17
      .parallel_count = parallel_count,
1508
17
      .graph = model->graph,
1509
17
      .compiled_data = compiled_data,
1510
17
      .tensor_arena = compiled_data->tensor_arena
1511
17
    };
1512
17
    ccv_cnnp_model_init_states(model, model->graph, _ccv_cnnp_init_states_for_tensors, &tensor_init_states);
1513
17
  }
1514
27
  compiled_data->is_test = is_test;
1515
27
  ccv_nnc_graph_exec_update_t update = {
1516
27
    .parallel_count = parallel_count,
1517
27
    .graph = model->graph,
1518
27
    .graph_exec_arena = compiled_data->graph_exec_arena,
1519
27
  };
1520
27
  ccv_cnnp_model_set_is_test(model, is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1521
27
  const int evaluate_to_size = compiled_data->evaluate.to_size;
1522
27
  compiled_data->evaluate.to_op_size = 0;
1523
27
  ccv_array_t* const backward_from = ccv_array_new(sizeof(int), 0, 0);
1524
72
  for (i = 0; i < evaluate_to_size; i++)
1525
45
  {
1526
45
    ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, compiled_data->evaluate.tos[i]);
1527
45
    if (to_op.graph)
1528
45
      compiled_data->evaluate.to_ops[compiled_data->evaluate.to_op_size++] = to_op;
1529
45
    const int* tos;
1530
45
    int to_size;
1531
45
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->evaluate.tos[i], &tos, &to_size);
1532
90
    for (j = 0; j < to_size; j++)
1533
45
    {
1534
45
      ccv_nnc_graph_exec_t const to_op = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1535
45
        .d = tos[j],
1536
45
        .graph = model->graph
1537
45
      });
1538
45
      if (to_op.graph)
1539
45
        ccv_array_add_unique_int(backward_from, to_op.d);
1540
45
    }
1541
45
  }
1542
27
  assert(backward_from->rnum > 0);
1543
27
  compiled_data->backward.from_op_size = backward_from->rnum;
1544
27
  compiled_data->backward.from_ops = (ccv_nnc_graph_exec_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_t) * backward_from->rnum);
1545
72
  for (i = 0; i < backward_from->rnum; i++)
1546
45
    compiled_data->backward.from_ops[i] = (ccv_nnc_graph_exec_t){
1547
45
      .d = *(int*)ccv_array_get(backward_from, i),
1548
45
      .graph = compiled_data->graph,
1549
45
    };
1550
27
  ccv_array_free(backward_from);
1551
27
  ccv_nnc_graph_set_default_static_schedule(compiled_data->graph, compiled_data->stream_type);
1552
27
  ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
1553
27
}
1554
1555
void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1556
7.93k
{
1557
7.93k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1558
7.93k
  assert(compiled_data);
1559
7.93k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1560
7.93k
  assert(output_size == model->output_size * parallel_count);
1561
7.93k
  assert(input_size == model->input_size * parallel_count);
1562
7.93k
  assert(model->graph);
1563
7.93k
  const int target_gradient_mode = _ccv_cnnp_is_disable_outgrad_all(params.disable_outgrad, model->input_size) ? CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES : CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS;
1564
7.93k
  const int mode_mismatch = (params.requires_grad && (compiled_data->graph_mode != CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE || compiled_data->gradient_mode != target_gradient_mode || compiled_data->disable_outgrad != params.disable_outgrad));
1565
7.93k
  if (!compiled_data->graph || mode_mismatch)
1566
56
  {
1567
56
    _ccv_cnnp_compiled_data_graph_free(compiled_data);
1568
56
    if (mode_mismatch) // If the mode mismatches, we need to redo the backward pass as well (no need to redo apply_gradients; it doesn't depend on target_gradient_mode or disable_outgrad).
1569
25
      _ccv_cnnp_compiled_data_backward_free(compiled_data);
1570
56
    if (params.requires_grad)
1571
27
      _ccv_cnnp_model_multistage_jit_0(model, params.disable_outgrad, params.is_test, inputs, input_size, outputs, output_size);
1572
29
    else
1573
29
      _ccv_cnnp_model_multistage_no_grad_jit(model, inputs, input_size, outputs, output_size);
1574
7.88k
  } else {
1575
7.88k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->tensor_arena);
1576
7.88k
    assert((input_size % parallel_count) == 0);
1577
7.88k
    const int input_size_per_p = input_size / parallel_count;
1578
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->inputs, inputs, input_size_per_p, parallel_count);
1579
7.88k
    assert((output_size % parallel_count) == 0);
1580
7.88k
    const int output_size_per_p = output_size / parallel_count;
1581
7.88k
    _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, model->outputs, outputs, output_size_per_p, parallel_count);
1582
7.88k
  }
1583
7.93k
  if (compiled_data->is_test != params.is_test)
1584
33
  {
1585
33
    compiled_data->is_test = params.is_test;
1586
33
    ccv_nnc_graph_exec_update_t update = {
1587
33
      .parallel_count = parallel_count,
1588
33
      .graph = model->graph,
1589
33
      .graph_exec_arena = compiled_data->graph_exec_arena,
1590
33
    };
1591
33
    ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
1592
33
  }
1593
7.93k
  if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
1594
43
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
1595
7.89k
  else {
1596
7.89k
    if (!compiled_data->evaluate.schedule)
1597
32
      compiled_data->evaluate.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, 0, 0, compiled_data->evaluate.to_ops, compiled_data->evaluate.to_op_size);
1598
7.89k
    ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->evaluate.schedule, tensor_tape, stream_context);
1599
7.89k
  }
1600
7.93k
}
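A minimal sketch of driving the evaluate path with gradients enabled; the field names mirror the params.* accesses in ccv_cnnp_model_evaluate above, and `model`, `inputs`, and `outputs` are assumed to exist for a single-input, single-output, non-parallel model:

  ccv_cnnp_evaluate_param_t params = {
    .requires_grad = 1, // compile the MULTISTAGE graph so ccv_cnnp_model_backward can follow
    .is_test = 0,
    .disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_NONE,
  };
  ccv_cnnp_model_evaluate(model, params, inputs, 1, outputs, 1, 0, 0);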
1601
1602
// Compile the graph to run ccv_cnnp_model_backward after ccv_cnnp_model_evaluate with requires_grad = true (MULTISTAGE_MODE).
1603
// Particularly, this method compiles the accumulator graph.
1604
static void _ccv_cnnp_model_multistage_jit_1(ccv_cnnp_model_t* const model)
1605
5
{
1606
5
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1607
5
  assert(compiled_data);
1608
5
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1609
5
  ccv_nnc_symbolic_graph_t* accum = ccv_nnc_symbolic_graph_new();
1610
5
  const int parallel_count = ccv_max(model->parallel_count, 1);
1611
5
  const int parameter_size = compiled_data->parameters->rnum;
1612
5
  int i, j;
1613
5
  compiled_data->backward.gradients = (ccv_nnc_tensor_symbol_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_t) * parameter_size * parallel_count * 3);
1614
5
  compiled_data->backward.accum_gradients = compiled_data->backward.gradients + parameter_size * parallel_count;
1615
5
  compiled_data->backward.updated_accum_gradients = compiled_data->backward.accum_gradients + parameter_size * parallel_count;
1616
20
  for (i = 0; i < parameter_size; i++)
1617
30
    for (j = 0; j < parallel_count; j++)
1618
15
    {
1619
15
      const ccv_nnc_tensor_param_t info = compiled_data->tensors.gradients[i + j * parameter_size]->info;
1620
      // Now the old gradient becomes the accumulated gradient; set up new gradient tensors so we can collect the fresh ones.
1621
15
      compiled_data->tensors.accum_gradients[i + j * parameter_size] = compiled_data->tensors.gradients[i + j * parameter_size];
1622
15
      compiled_data->tensors.gradients[i + j * parameter_size] = ccv_nnc_tensor_new(0, info, 0);
1623
15
      ccv_nnc_tensor_symbol_t inputs[2];
1624
15
      inputs[0] = compiled_data->backward.accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1625
15
      inputs[1] = compiled_data->backward.gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1626
15
      ccv_nnc_tensor_symbol_t output = compiled_data->backward.updated_accum_gradients[i + j * parameter_size] = ccv_nnc_tensor_symbol_new(accum, info, 0);
1627
15
      ccv_nnc_graph_exec_symbol_new(accum, CMD_EWSUM_FORWARD(), inputs, 2, &output, 1, 0);
1628
15
    }
1629
5
  ccv_nnc_graph_exec_symbol_autogen(accum, 0, 0, CCV_NNC_AUTOGEN_ALL_EXECS | CCV_NNC_AUTOGEN_SOURCES_AND_DESTINATIONS);
1630
5
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1631
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1632
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1, tensor_binds);
1633
5
  _ccv_cnnp_model_bind_tensors(accum, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1, tensor_binds);
1634
5
  ccv_nnc_symbolic_graph_compile(accum, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, SYMBOLIC_GRAPH_SOURCES(accum), SYMBOLIC_GRAPH_DESTINATIONS(accum), &compiled_data->backward.accum, &compiled_data->backward.tensor_arena, &compiled_data->backward.graph_exec_arena);
1635
5
  ccv_nnc_symbolic_graph_free(accum);
1636
5
  ccv_nnc_graph_set_default_static_schedule(compiled_data->backward.accum, compiled_data->stream_type);
1637
5
  ccv_array_free(tensor_binds);
1638
5
}
1639
1640
void ccv_cnnp_model_backward(ccv_cnnp_model_t* const model, ccv_nnc_tensor_t* const* const ingrads, const int ingrad_size, ccv_nnc_tensor_t* const* const outgrads, const int outgrad_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
1641
7.88k
{
1642
7.88k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1643
7.88k
  assert(compiled_data);
1644
7.88k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1645
7.88k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1646
7.88k
  assert(ingrad_size == 0 || ingrad_size == model->output_size * parallel_count);
1647
7.88k
  if (outgrad_size > 0)
1648
2.51k
    { assert(outgrad_size == compiled_data->outgrad_size * parallel_count); }
1649
7.88k
  assert(model->graph);
1650
7.88k
  assert(compiled_data->graph);
1651
7.88k
  const int parameter_size = compiled_data->parameters->rnum;
1652
  // If we need to accumulate the gradients now, do jit on accumulator.
1653
7.88k
  if (compiled_data->backward.count > 0)
1654
1.71k
  {
1655
1.71k
    if (!compiled_data->backward.accum)
1656
5
      _ccv_cnnp_model_multistage_jit_1(model);
1657
1.71k
    else if (compiled_data->backward.count == 1) {
1658
      //  On this round, we need to switch accumulated gradients with gradients (so we can do accumulation properly).
1659
496
      int i;
1660
496
      ccv_nnc_tensor_arena_clear_bindings(compiled_data->backward.tensor_arena);
1661
1.48k
      for (i = 0; i < parameter_size * parallel_count; i++)
1662
986
      {
1663
986
        ccv_nnc_tensor_t* tensor;
1664
986
        CCV_SWAP(compiled_data->tensors.accum_gradients[i], compiled_data->tensors.gradients[i], tensor);
1665
986
      }
1666
      // Do rebind in case we messed up the binding (we switch accum_gradients and gradients).
1667
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.gradients, compiled_data->tensors.gradients, parameter_size * parallel_count, 1);
1668
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1669
496
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->backward.tensor_arena, 0, compiled_data->backward.updated_accum_gradients, compiled_data->tensors.accum_gradients, parameter_size * parallel_count, 1);
1670
496
    }
1671
1.71k
  }
1672
7.88k
  const int ingrad_size_per_p = model->output_size;
1673
7.88k
  const int outgrad_size_per_p = compiled_data->outgrad_size;
1674
7.88k
  int i, j;
1675
15.7k
  for (i = 0; i < ingrad_size_per_p; i++)
1676
7.88k
  {
1677
7.88k
    const ccv_nnc_tensor_symbol_t ingrad = ccv_nnc_tensor_symbol_for_backward(model->graph, compiled_data->f[i]);
1678
7.88k
    if (!ingrad_size || !ingrads || ingrads[i] == 0)
1679
4.19k
    {
1680
      // Set it to 1 if it is not specified.
1681
4.19k
      ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ingrad);
1682
4.19k
      if (ingrad_tensor)
1683
4.19k
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
1684
4.31k
      for (j = 1; j < parallel_count; j++)
1685
120
      {
1686
120
        ccv_nnc_tensor_t* const ingrad_tensor = ccv_nnc_tensor_from_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j));
1687
120
        if (ingrad_tensor)
1688
120
          ccv_nnc_cmd_exec(CMD_SET_FORWARD(1), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(ingrad_tensor), stream_context);
1689
120
      }
1690
4.19k
    } else {
1691
      // Make sure the length matches, in case it is an alias.
1692
3.68k
      assert(ccv_nnc_tensor_count(ingrads[i]->info) == ccv_nnc_tensor_count(ccv_nnc_tensor_symbol_params(model->graph, ingrad)));
1693
3.68k
      ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ingrad, ingrads[i]);
1694
3.69k
      for (j = 1; j < parallel_count; j++)
1695
6
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, ingrad, j), ingrads[i + ingrad_size_per_p * j]);
1696
3.68k
    }
1697
7.88k
  }
1698
7.88k
  if (outgrad_size > 0)
1699
2.51k
  {
1700
2.51k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS && "shouldn't pass disable_outgrad to ccv_cnnp_model_evaluate before if you plan to compute outgrad");
1701
5.13k
    for (i = 0; i < outgrad_size_per_p; i++)
1702
2.62k
      if (outgrads[i])
1703
2.43k
      {
1704
2.43k
        const ccv_nnc_tensor_symbol_t outgrad = compiled_data->outgrads[i];
1705
2.43k
        ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, outgrad, outgrads[i]);
1706
2.43k
        for (j = 1; j < parallel_count; j++)
1707
6
          ccv_nnc_tensor_bind_symbol(compiled_data->tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, outgrad, j), outgrads[i + outgrad_size_per_p * j]);
1708
2.43k
      }
1709
5.37k
  } else {
1710
5.37k
    assert(compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES ||
1711
5.37k
      compiled_data->gradient_mode == CCV_CNNP_COMPILED_DATA_GRADIENT_TRAINABLES_AND_INPUTS);
1712
5.37k
  }
1713
  // We need to rebind here because ccv_cnnp_model_evaluate clears bindings, which resets all bindings for the gradients.
1714
  // For parameters and internals this is fine: clearing bindings restores the original bindings, which are exactly these
1715
  // parameters and internals. The same cannot be said for gradients due to the accum_gradients switching.
1716
7.88k
  _ccv_cnnp_bind_tensors_to_arena(compiled_data->tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
1717
7.88k
  if (!compiled_data->backward.schedule)
1718
24
    compiled_data->backward.schedule = ccv_nnc_graph_static_schedule_new(compiled_data->graph, compiled_data->stream_type, compiled_data->backward.from_ops, compiled_data->backward.from_op_size, 0, 0);
1719
  // Run the backward pass.
1720
7.88k
  ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, compiled_data->backward.schedule, tensor_tape, stream_context);
1721
  // If we need to run accumulation round, do that now.
1722
7.88k
  if (compiled_data->backward.count > 0)
1723
1.71k
    ccv_nnc_graph_run_with_schedule(compiled_data->backward.accum, 0, 0, 0, stream_context);
1724
  // Update the count; this determines whether we need to accumulate or not.
1725
7.88k
  ++compiled_data->backward.count;
1726
7.88k
}
1727
1728
// Compile the graph to run ccv_cnnp_model_apply_gradients after ccv_cnnp_model_backward (MULTISTAGE_MODE).
1729
// Particularly, this method compiles the parameter update graph.
1730
static void _ccv_cnnp_model_multistage_jit_2(ccv_cnnp_model_t* const model)
1731
21
{
1732
21
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1733
21
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1734
21
  const int parallel_count = ccv_max(model->parallel_count, 1);
1735
21
  const int parameter_size = compiled_data->parameters->rnum;
1736
21
  ccv_array_t* const tensor_binds = ccv_array_new(sizeof(ccv_nnc_tensor_bind_t), 0, 0);
1737
21
  _ccv_cnnp_model_bind_tensors(model->graph, (ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, 0), compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1738
21
  _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->updated_parameters, compiled_data->tensors.parameters, parameter_size, parallel_count, tensor_binds);
1739
  // Bind accumulated gradients.
1740
21
  if (compiled_data->backward.count > 1)
1741
4
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count, tensor_binds);
1742
17
  else
1743
17
    _ccv_cnnp_model_bind_tensors(model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count, tensor_binds);
1744
21
  ccv_array_t* const apply_gradients_from = ccv_array_new(sizeof(int), 0, 0);
1745
21
  int i, j;
1746
247
  for (i = 0; i < compiled_data->backward.to_size; i++)
1747
226
  {
1748
226
    const int* tos;
1749
226
    int to_size;
1750
226
    ccv_nnc_graph_exec_symbol_to(model->graph, compiled_data->backward.tos[i], &tos, &to_size);
1751
726
    for (j = 0; j < to_size; j++)
1752
500
    {
1753
      // Check if this is already show up in the backward graph, if that is the case, it won't be in the apply
1754
      // gradients graph.
1755
500
      const ccv_nnc_graph_exec_t exec = ccv_nnc_graph_exec_from_symbol(compiled_data->graph_exec_arena, (ccv_nnc_graph_exec_symbol_t){
1756
500
        .d = tos[j],
1757
500
        .graph = model->graph,
1758
500
      });
1759
500
      if (!exec.graph)
1760
313
        ccv_array_add_unique_int(apply_gradients_from, tos[j]);
1761
500
    }
1762
226
  }
1763
21
  const int from_size = apply_gradients_from->rnum;
1764
21
  ccv_nnc_graph_exec_symbol_t* const froms = (ccv_nnc_graph_exec_symbol_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_t) * from_size);
1765
154
  for (i = 0; i < from_size; i++)
1766
133
    froms[i] = (ccv_nnc_graph_exec_symbol_t){
1767
133
      .d = *(int*)ccv_array_get(apply_gradients_from, i),
1768
133
      .graph = model->graph
1769
133
    };
1770
21
  ccv_array_free(apply_gradients_from);
1771
  // It can only end with updates on the parameters.
1772
21
  ccv_array_t* const tos = ccv_array_new(sizeof(ccv_nnc_graph_exec_symbol_t), parameter_size * parallel_count, 0);
1773
154
  for (i = 0; i < parameter_size; i++)
1774
133
  {
1775
133
    ccv_array_push(tos, &compiled_data->update_nodes[i]);
1776
313
    for (j = 1; j < parallel_count; j++)
1777
180
    {
1778
180
      const ccv_nnc_graph_exec_symbol_t copy = ccv_nnc_graph_exec_symbol_copy(model->graph, compiled_data->update_nodes[i], j);
1779
180
      ccv_array_push(tos, &copy);
1780
180
    }
1781
133
  }
1782
21
  ccv_nnc_symbolic_graph_compile(model->graph, compiled_data->compile_params, (ccv_nnc_tensor_bind_t*)ccv_array_get(tensor_binds, 0), tensor_binds->rnum, 0, 0, froms, from_size, (ccv_nnc_graph_exec_symbol_t*)ccv_array_get(tos, 0), tos->rnum, &compiled_data->apply_gradients.graph, &compiled_data->apply_gradients.tensor_arena, &compiled_data->apply_gradients.graph_exec_arena);
1783
21
  ccv_array_free(tos);
1784
21
  ccv_array_free(tensor_binds);
1785
21
  ccfree(froms);
1786
21
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
1787
213
  for (i = 0; i < max_saved_aux_size * parameter_size; i++)
1788
192
  {
1789
    // Skip on no tensor.
1790
192
    if (compiled_data->saved_aux[i].source.d == CCV_NNC_NO_TENSOR_SYMBOL)
1791
0
      continue;
1792
192
    ccv_nnc_tensor_t* const tensor = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, compiled_data->saved_aux[i].source);
1793
192
    ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &tensor, 1, 0);
1794
540
    for (j = 1; j < parallel_count; j++)
1795
348
    {
1796
348
      ccv_nnc_tensor_t* const copy = ccv_nnc_tensor_from_symbol(compiled_data->apply_gradients.tensor_arena, ccv_nnc_tensor_symbol_copy(model->graph, compiled_data->saved_aux[i].source, j));
1797
348
      if (copy)
1798
348
        ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, &copy, 1, 0);
1799
348
    }
1800
192
  }
1801
21
  ccv_nnc_graph_set_default_static_schedule(compiled_data->apply_gradients.graph, compiled_data->stream_type);
1802
21
}
1803
1804
void ccv_cnnp_model_apply_gradients(ccv_cnnp_model_t* const model, ccv_nnc_stream_context_t* const stream_context)
1805
7.81k
{
1806
7.81k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1807
7.81k
  assert(compiled_data);
1808
7.81k
  assert(compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE);
1809
7.81k
  const int parallel_count = ccv_max(model->parallel_count, 1);
1810
7.81k
  assert(model->graph);
1811
7.81k
  assert(compiled_data->graph);
1812
  // Skip if there is no backward pass.
1813
7.81k
  if (compiled_data->backward.count <= 0)
1814
1.65k
    return;
1815
  // Skip if there are no parameters.
1816
6.16k
  if (compiled_data->parameters->rnum == 0)
1817
1
  {
1818
1
    compiled_data->backward.count = 0;
1819
1
    return;
1820
1
  }
1821
6.16k
  if (!compiled_data->apply_gradients.graph)
1822
21
    _ccv_cnnp_model_multistage_jit_2(model);
1823
6.14k
  else {
1824
6.14k
    const int parameter_size = compiled_data->parameters->rnum;
1825
6.14k
    ccv_nnc_tensor_arena_clear_bindings(compiled_data->apply_gradients.tensor_arena);
1826
    // Change to bind accum_gradients if we do gradient accumulation (run backward more than once).
1827
6.14k
    if (compiled_data->backward.count > 1)
1828
497
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.accum_gradients, parameter_size, parallel_count);
1829
5.64k
    else
1830
5.64k
      _ccv_cnnp_bind_tensors_to_arena(compiled_data->apply_gradients.tensor_arena, model->graph, compiled_data->gradients, compiled_data->tensors.gradients, parameter_size, parallel_count);
1831
6.14k
  }
1832
6.16k
  ccv_nnc_graph_run_with_schedule(compiled_data->apply_gradients.graph, 0, 0, 0, stream_context);
1833
  // Reset backward count to 0.
1834
6.16k
  compiled_data->backward.count = 0;
1835
6.16k
}
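A minimal usage sketch of the multistage path above, assuming a compiled model and caller-provided `input`/`output` tensors (the evaluate/backward signatures follow the public ccv_cnnp API; the tensor names are hypothetical):
// Run one training step: forward with gradients, backward, then apply gradients.
ccv_cnnp_model_evaluate(model, (ccv_cnnp_evaluate_param_t){
  .requires_grad = 1,
}, TENSOR_LIST(input), TENSOR_LIST(output), 0, 0);
// No explicit incoming / outgoing gradient tensors are supplied here.
ccv_cnnp_model_backward(model, 0, 0, 0, 0, 0, 0);
// The first call goes through _ccv_cnnp_model_multistage_jit_2; later calls reuse the graph.
ccv_cnnp_model_apply_gradients(model, 0);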
1836
1837
void ccv_cnnp_model_set_parameter(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, const ccv_nnc_tensor_t* const tensor)
1838
8
{
1839
8
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1840
8
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel;
1841
8
  assert(parameter->param_sel != 0);
1842
8
  const int tensors_init = !!compiled_data->tensors_init.v;
1843
8
  if (!tensors_init)
1844
5
    ccv_cnnp_model_tensors_init(model, compiled_data);
1845
8
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1846
8
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
1847
8
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref;
1848
8
  if (param_ref < 0)
1849
0
    { assert(parameter_indices->rnum == 1); }
1850
8
  else
1851
8
    { assert(param_ref < parameter_indices->rnum); }
1852
8
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
1853
8
  ccv_array_free(parameter_indices);
1854
8
  const int parameter_size = compiled_data->parameters->rnum;
1855
8
  assert(d >= 0);
1856
8
  assert(d < parameter_size);
1857
8
  const int parallel_count = ccv_max(model->parallel_count, 1);
1858
8
  ccv_nnc_tensor_t* const dest = compiled_data->tensors.parameters[d];
1859
8
  assert(dest);
1860
8
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST((ccv_nnc_tensor_t*)tensor), TENSOR_LIST(dest), 0);
1861
8
  int i;
1862
8
  for (i = 1; i < parallel_count; i++)
1863
0
  {
1864
0
    ccv_nnc_tensor_t* const copy_tensor = compiled_data->tensors.parameters[d + i * parameter_size];
1865
0
    if (copy_tensor)
1866
0
      ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
1867
0
  }
1868
  // Mark this symbol as init'ed.
1869
8
  const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(compiled_data->parameters, d))->d;
1870
8
  compiled_data->tensors_init.v[s >> 5] |= (1u << (s & 0x1f));
1871
8
}
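A hedged sketch of calling ccv_cnnp_model_set_parameter: `fc` stands for a hypothetical sub-model whose first parameter matches the shape of the host tensor written in (the 10-element float32 shape is illustrative only):
// Prepare a host tensor and zero it with the same CMD_SET_FORWARD used above.
ccv_nnc_tensor_t* const host = ccv_nnc_tensor_new(0, CPU_TENSOR_NHWC(32F, 10), 0);
ccv_nnc_cmd_exec(CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, TENSOR_LIST(host), 0);
// Selector -1 / index 0 addresses the first parameter of fc (see ccv_cnnp_model_parameters).
ccv_cnnp_model_set_parameter(model, ccv_cnnp_model_parameters(fc, -1, 0), host);
ccv_nnc_tensor_free(host);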
1872
1873
void ccv_cnnp_model_parameter_copy(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter, ccv_nnc_tensor_t* const tensor)
1874
6
{
1875
6
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1876
6
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel;
1877
6
  assert(parameter->param_sel != 0);
1878
6
  assert(compiled_data->tensors.parameters);
1879
6
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1880
6
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
1881
6
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref;
1882
6
  if (param_ref < 0)
1883
3
    { assert(parameter_indices->rnum == 1); }
1884
3
  else
1885
3
    { assert(param_ref < parameter_indices->rnum); }
1886
6
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
1887
6
  ccv_array_free(parameter_indices);
1888
6
  const int parameter_size = compiled_data->parameters->rnum;
1889
6
  assert(d >= 0);
1890
6
  assert(d < parameter_size);
1891
  // We don't need to consider parallel_count, every parameter on each device is identical.
1892
6
  ccv_nnc_tensor_t* const src = compiled_data->tensors.parameters[d];
1893
6
  assert(src);
1894
6
  ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(tensor), 0);
1895
6
}
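A short sketch of reading a parameter back out, assuming `host` is a caller-allocated tensor whose layout matches the parameter (the data transfer above handles a device-to-host copy for GPU models):
// Copy the first parameter of the model into a caller-owned tensor.
const ccv_cnnp_model_io_t first_param = ccv_cnnp_model_parameters(model, -1, 0);
ccv_cnnp_model_parameter_copy(model, first_param, host);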
1896
1897
ccv_nnc_tensor_param_t ccv_cnnp_model_parameter_tensor_params(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameter)
1898
1
{
1899
1
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
1900
1
  const int param_sel = parameter->param_sel > 0 ? parameter->param_sel - 1 : parameter->param_sel;
1901
1
  assert(parameter->param_sel != 0);
1902
1
  assert(compiled_data->tensors.parameters);
1903
1
  ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1904
1
  ccv_cnnp_model_add_to_parameter_indices(parameter->model, param_sel, parameter_indices);
1905
1
  const int param_ref = parameter->param_ref > 0 ? parameter->param_ref - 1 : parameter->param_ref;
1906
1
  if (param_ref < 0)
1907
1
    { assert(parameter_indices->rnum == 1); }
1908
0
  else
1909
0
    { assert(param_ref < parameter_indices->rnum); }
1910
1
  const int d = *(int*)ccv_array_get(parameter_indices, param_ref >= 0 ? param_ref : 0);
1911
1
  ccv_array_free(parameter_indices);
1912
1
  const int parameter_size = compiled_data->parameters->rnum;
1913
1
  assert(d >= 0);
1914
1
  assert(d < parameter_size);
1915
  // We don't need to consider parallel_count, every parameter on each device is identical.
1916
1
  ccv_nnc_tensor_t* const tensor = compiled_data->tensors.parameters[d];
1917
1
  assert(tensor);
1918
1
  return tensor->info;
1919
1
}
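A sketch combining the two calls above: query the parameter's tensor parameters first, then allocate a matching destination for ccv_cnnp_model_parameter_copy (all names are illustrative):
const ccv_cnnp_model_io_t first_param = ccv_cnnp_model_parameters(model, -1, 0);
// tensor->info carries the datatype, memory type and dimensions of the stored parameter.
const ccv_nnc_tensor_param_t info = ccv_cnnp_model_parameter_tensor_params(model, first_param);
ccv_nnc_tensor_t* const buf = ccv_nnc_tensor_new(0, info, 0);
ccv_cnnp_model_parameter_copy(model, first_param, buf);
ccv_nnc_tensor_free(buf);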
1920
1921
static ccv_array_t* _ccv_cnnp_model_parameter_indices(const ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, int* const param_ref)
1922
44
{
1923
44
  const int to_param_sel = parameters->param_sel > 0 ? parameters->param_sel - 1 : parameters->param_sel;
1924
44
  assert(parameters->param_sel != 0);
1925
44
  ccv_array_t* const to_parameter_indices = ccv_array_new(sizeof(int), 0, 0);
1926
44
  ccv_cnnp_model_add_to_parameter_indices(parameters->model, to_param_sel, to_parameter_indices);
1927
44
  *param_ref = parameters->param_ref > 0 ? parameters->param_ref - 1 : parameters->param_ref;
1928
44
  return to_parameter_indices;
1929
44
}
1930
1931
static void _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters, ccv_array_t** const parameter_indices, int* const param_ref, ccv_array_t** const from_parameter_indices, int* const from_param_ref)
1932
12
{
1933
  // If the model is not compiled yet, compile it now.
1934
12
  if (!model->graph)
1935
3
  {
1936
3
    model->graph = ccv_nnc_symbolic_graph_new();
1937
3
    assert(from_model->compiled_data);
1938
3
    const int input_size = from_model->input_size;
1939
3
    ccv_nnc_tensor_param_t input_params[input_size];
1940
3
    int i;
1941
9
    for (i = 0; i < input_size; i++)
1942
6
      input_params[i] = ccv_nnc_tensor_symbol_params(from_model->graph, from_model->inputs[i]);
1943
3
    _ccv_cnnp_model_compile(model, input_params, input_size, from_model->compiled_data->loss);
1944
3
    model->parallel_count = from_model->parallel_count;
1945
3
    model->memory_compression = from_model->memory_compression;
1946
3
    model->compiled_data->stream_type = from_model->compiled_data->stream_type;
1947
3
    model->compiled_data->minimize.minimizer = from_model->compiled_data->minimize.minimizer;
1948
3
    model->compiled_data->minimize.max_saved_aux_size = from_model->compiled_data->minimize.max_saved_aux_size;
1949
3
  }
1950
12
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
1951
12
  assert(to_compiled_data);
1952
12
  const int to_tensors_init = !!to_compiled_data->tensors_init.v;
1953
12
  if (!to_tensors_init)
1954
9
    ccv_cnnp_model_tensors_init(model, to_compiled_data);
1955
12
  assert(to_compiled_data->tensors.parameters);
1956
12
  *parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, param_ref);
1957
12
  *from_parameter_indices = _ccv_cnnp_model_parameter_indices(from_model, from_parameters, from_param_ref);
1958
12
  if (*from_param_ref < 0 && *param_ref >= 0)
1959
0
    { assert((*from_parameter_indices)->rnum == 1); }
1960
12
  else if (*from_param_ref >= 0)
1961
0
    { assert(*from_param_ref < (*from_parameter_indices)->rnum); }
1962
12
  if (*param_ref < 0 && *from_param_ref >= 0)
1963
0
    { assert((*parameter_indices)->rnum == 1); }
1964
12
  else if (*param_ref >= 0)
1965
0
    { assert(*param_ref < (*parameter_indices)->rnum); }
1966
  // Should have exactly the same number of tensors.
1967
12
  if (*param_ref < 0 && *from_param_ref < 0)
1968
12
    { assert((*from_parameter_indices)->rnum == (*parameter_indices)->rnum); }
1969
12
}
1970
1971
void ccv_cnnp_model_set_parameters(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
1972
9
{
1973
9
  ccv_array_t* to_parameter_indices;
1974
9
  int to_param_ref;
1975
9
  ccv_array_t* from_parameter_indices;
1976
9
  int from_param_ref;
1977
9
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref);
1978
  // To models.
1979
9
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
1980
9
  assert(to_compiled_data);
1981
  // From models.
1982
9
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
1983
9
  const int parallel_count = ccv_max(model->parallel_count, 1);
1984
9
  const int to_parameter_size = to_compiled_data->parameters->rnum;
1985
9
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 1;
1986
9
  int i, j;
1987
18
  for (i = 0; i < rnum; i++)
1988
9
  {
1989
9
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
1990
9
    assert(src_d >= 0);
1991
9
    assert(src_d < from_compiled_data->parameters->rnum);
1992
9
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
1993
    // If the original is not init'ed, we cannot copy from it.
1994
9
    if (!(from_compiled_data->tensors_init.v[s >> 5] & (1u << (s & 0x1f))))
1995
0
      continue;
1996
9
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
1997
9
    assert(dest_d >= 0);
1998
9
    assert(dest_d < to_compiled_data->parameters->rnum);
1999
9
    ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d];
2000
9
    assert(src);
2001
9
    ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
2002
9
    assert(dest);
2003
9
    ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(src), TENSOR_LIST(dest), 0);
2004
27
    for (j = 1; j < parallel_count; j++)
2005
18
    {
2006
18
      ccv_nnc_tensor_t* const copy_tensor = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2007
18
      if (copy_tensor)
2008
18
        ccv_nnc_cmd_exec(CMD_DATA_TRANSFER_FORWARD(), ccv_nnc_no_hint, 0, TENSOR_LIST(dest), TENSOR_LIST(copy_tensor), 0);
2009
18
    }
2010
    // Mark this symbol as init'ed.
2011
9
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2012
9
    to_compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
2013
9
  }
2014
9
  ccv_array_free(to_parameter_indices);
2015
9
  ccv_array_free(from_parameter_indices);
2016
9
}
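A hedged sketch of copying every parameter from one model into another (for example, syncing a target network from an online one); both models are assumed to expose matching parameter lists:
// -1 / -1 selects all parameters on both sides, so the rnum assertions above hold.
ccv_cnnp_model_set_parameters(dst_model, ccv_cnnp_model_parameters(dst_model, -1, -1),
  src_model, ccv_cnnp_model_parameters(src_model, -1, -1));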
2017
2018
ccv_nnc_stream_context_t* ccv_cnnp_compiled_data_get_stream(ccv_cnnp_compiled_data_t* const compiled_data, const int type)
2019
24
{
2020
24
  if (!compiled_data->stream_map)
2021
4
    compiled_data->stream_map = kh_init(stream_map);
2022
24
  int ret = 0;
2023
24
  khiter_t k = kh_put(stream_map, compiled_data->stream_map, type, &ret);
2024
24
  assert(ret >= 0);
2025
24
  ccv_nnc_stream_context_t* stream = kh_val(compiled_data->stream_map, k);
2026
  // If ret == 0, the key already exists and we can return directly; otherwise, create and return.
2027
24
  if (ret != 0)
2028
16
  {
2029
16
    stream = ccv_nnc_stream_context_new(type);
2030
16
    kh_val(compiled_data->stream_map, k) = stream;
2031
16
  }
2032
24
  return stream;
2033
24
}
2034
2035
void ccv_cnnp_model_parameters_zip_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context, const ccv_cnnp_model_t* const from_model, const ccv_cnnp_model_io_t from_parameters)
2036
3
{
2037
3
  ccv_array_t* to_parameter_indices;
2038
3
  int to_param_ref;
2039
3
  ccv_array_t* from_parameter_indices;
2040
3
  int from_param_ref;
2041
3
  _ccv_cnnp_model_to_parameter_indices_and_from_parameter_indices(model, parameters, from_model, from_parameters, &to_parameter_indices, &to_param_ref, &from_parameter_indices, &from_param_ref);
2042
  // To models.
2043
3
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2044
3
  assert(to_compiled_data);
2045
  // From models.
2046
3
  const ccv_cnnp_compiled_data_t* const from_compiled_data = from_model->compiled_data;
2047
3
  const int parallel_count = ccv_max(model->parallel_count, 1);
2048
3
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2049
3
  const int rnum = (to_param_ref < 0 && from_param_ref < 0) ? from_parameter_indices->rnum : 1;
2050
3
  assert(aux_in_size >= 0);
2051
3
  assert(aux_out_size >= 0);
2052
3
  int i, j;
2053
3
  ccv_nnc_tensor_t* inputs[aux_in_size + 2];
2054
3
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2055
3
  for (i = 0; i < aux_in_size; i++)
2056
0
    inputs[i + 2] = aux_ins[i];
2057
3
  for (i = 0; i < aux_out_size; i++)
2058
0
    outputs[i + 1] = aux_outs[i];
2059
6
  for (i = 0; i < rnum; i++)
2060
3
  {
2061
3
    const int src_d = *(int*)ccv_array_get(from_parameter_indices,from_param_ref >= 0 ? from_param_ref : i);
2062
3
    assert(src_d >= 0);
2063
3
    assert(src_d < from_compiled_data->parameters->rnum);
2064
3
    const int s = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(from_compiled_data->parameters, src_d))->d;
2065
    // If the original is not init'ed, we cannot copy from it.
2066
3
    if (!(from_compiled_data->tensors_init.v[s >> 5] & (1u << (s & 0x1f))))
2067
0
      continue;
2068
3
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2069
3
    assert(dest_d >= 0);
2070
3
    assert(dest_d < to_compiled_data->parameters->rnum);
2071
3
    if (parallel_count > 1)
2072
2
    {
2073
2
      ccv_nnc_stream_context_t* streams[parallel_count];
2074
2
      ccv_nnc_stream_signal_t* signal;
2075
2
      if (stream_context)
2076
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2077
10
      for (j = 0; j < parallel_count; j++)
2078
8
      {
2079
8
        ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d + j * to_parameter_size];
2080
8
        ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2081
8
        if (!dest || !src)
2082
0
        {
2083
0
          streams[j] = 0;
2084
0
          continue;
2085
0
        }
2086
        // At the moment, we can only handle them on the same device.
2087
8
        assert(CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GET_MEMORY(dest->info.type));
2088
8
        assert(CCV_TENSOR_GET_DEVICE_ID(src->info.type) == CCV_TENSOR_GET_DEVICE_ID(dest->info.type));
2089
8
        const int stream_type = CCV_TENSOR_GET_MEMORY(src->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2090
8
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(src->info.type);
2091
8
        int type = stream_type;
2092
8
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2093
8
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2094
        // Wait signal to finish.
2095
8
        if (stream_context)
2096
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2097
8
        inputs[0] = outputs[0] = dest;
2098
8
        inputs[1] = src;
2099
8
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_0);
2100
8
        if (stream_context)
2101
4
        {
2102
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2103
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2104
4
        }
2105
8
        streams[j] = stream_0;
2106
8
      }
2107
      // If this should be blocking, block it.
2108
2
      if (!stream_context)
2109
5
        for (j = 0; j < parallel_count; j++)
2110
4
          if (streams[j])
2111
4
            ccv_nnc_stream_context_wait(streams[j]);
2112
2
    } else {
2113
1
      ccv_nnc_tensor_t* const src = from_compiled_data->tensors.parameters[src_d];
2114
1
      assert(src);
2115
1
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
2116
1
      assert(dest);
2117
1
      inputs[0] = outputs[0] = dest;
2118
1
      inputs[1] = src;
2119
1
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 2, outputs, aux_out_size + 1, stream_context);
2120
1
    }
2121
    // Mark this symbol as init'ed.
2122
3
    const int d = ((ccv_nnc_tensor_symbol_t*)ccv_array_get(to_compiled_data->parameters, dest_d))->d;
2123
3
    to_compiled_data->tensors_init.v[d >> 5] |= (1u << (d & 0x1f));
2124
3
  }
2125
3
  ccv_array_free(to_parameter_indices);
2126
3
  ccv_array_free(from_parameter_indices);
2127
3
}
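A sketch of the zip variant used for an exponential-moving-average style update; CMD_ADD_FORWARD takes two blend coefficients in ccv_nnc, and the 0.99/0.01 values as well as the model names are illustrative assumptions:
// For each parameter pair: dest = 0.99 * dest + 0.01 * src (inputs[0] = dest, inputs[1] = src above).
ccv_cnnp_model_parameters_zip_map(ema_model, ccv_cnnp_model_parameters(ema_model, -1, -1),
  CMD_ADD_FORWARD(0.99, 0.01), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0,
  live_model, ccv_cnnp_model_parameters(live_model, -1, -1));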
2128
2129
void ccv_cnnp_model_parameters_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2130
14
{
2131
14
  int to_param_ref;
2132
14
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2133
  // To models.
2134
14
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2135
14
  assert(to_compiled_data);
2136
  // Tensor has to be inited already.
2137
14
  assert(!!to_compiled_data->tensors_init.v);
2138
14
  assert(to_compiled_data->tensors.parameters);
2139
  // From models.
2140
14
  const int parallel_count = ccv_max(model->parallel_count, 1);
2141
14
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2142
14
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 1;
2143
14
  assert(aux_in_size >= 0);
2144
14
  assert(aux_out_size >= 0);
2145
14
  int i, j;
2146
14
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2147
14
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2148
14
  for (i = 0; i < aux_in_size; i++)
2149
0
    inputs[i + 1] = aux_ins[i];
2150
14
  for (i = 0; i < aux_out_size; i++)
2151
0
    outputs[i + 1] = aux_outs[i];
2152
28
  for (i = 0; i < rnum; i++)
2153
14
  {
2154
14
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2155
14
    assert(dest_d >= 0);
2156
14
    assert(dest_d < to_compiled_data->parameters->rnum);
2157
14
    if (parallel_count > 1)
2158
4
    {
2159
4
      ccv_nnc_stream_context_t* streams[parallel_count];
2160
4
      ccv_nnc_stream_signal_t* signal;
2161
4
      if (stream_context)
2162
1
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2163
20
      for (j = 0; j < parallel_count; j++)
2164
16
      {
2165
16
        ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d + j * to_parameter_size];
2166
16
        if (!dest)
2167
0
        {
2168
0
          streams[j] = 0;
2169
0
          continue;
2170
0
        }
2171
16
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2172
16
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2173
16
        int type = stream_type;
2174
16
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2175
16
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2176
        // Wait signal to finish.
2177
16
        if (stream_context)
2178
4
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2179
16
        inputs[0] = outputs[0] = dest;
2180
16
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2181
16
        if (stream_context)
2182
4
        {
2183
4
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2184
4
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2185
4
        }
2186
16
        streams[j] = stream_0;
2187
16
      }
2188
      // If this should be blocking, block it.
2189
4
      if (!stream_context)
2190
15
        for (j = 0; j < parallel_count; j++)
2191
12
          if (streams[j])
2192
12
            ccv_nnc_stream_context_wait(streams[j]);
2193
10
    } else {
2194
10
      ccv_nnc_tensor_t* const dest = to_compiled_data->tensors.parameters[dest_d];
2195
10
      assert(dest);
2196
10
      inputs[0] = outputs[0] = dest;
2197
10
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2198
10
    }
2199
    // No need to mark this symbol as init'ed, it is already.
2200
14
  }
2201
14
  ccv_array_free(to_parameter_indices);
2202
14
}
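A sketch of the in-place variant: resetting every parameter to zero with the same CMD_SET_FORWARD command the JIT uses for saved aux tensors (the parameters must already be init'ed, per the assertions above):
ccv_cnnp_model_parameters_map(model, ccv_cnnp_model_parameters(model, -1, -1),
  CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);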
2203
2204
void ccv_cnnp_model_parameter_gradients_map(ccv_cnnp_model_t* const model, const ccv_cnnp_model_io_t parameters, const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const aux_ins, const int aux_in_size, ccv_nnc_tensor_t* const* const aux_outs, const int aux_out_size, ccv_nnc_stream_context_t* const stream_context)
2205
6
{
2206
6
  int to_param_ref;
2207
6
  ccv_array_t* const to_parameter_indices = _ccv_cnnp_model_parameter_indices(model, parameters, &to_param_ref);
2208
  // To models.
2209
6
  ccv_cnnp_compiled_data_t* const to_compiled_data = model->compiled_data;
2210
6
  assert(to_compiled_data);
2211
  // Tensor has to be inited already.
2212
6
  assert(!!to_compiled_data->tensors_init.v);
2213
6
  ccv_nnc_tensor_t** tensor_gradients;
2214
6
  if (to_compiled_data->backward.count > 1)
2215
3
    tensor_gradients = to_compiled_data->tensors.accum_gradients;
2216
3
  else
2217
3
    tensor_gradients = to_compiled_data->tensors.gradients;
2218
6
  assert(tensor_gradients);
2219
  // From models.
2220
6
  const int parallel_count = ccv_max(model->parallel_count, 1);
2221
6
  const int to_parameter_size = to_compiled_data->parameters->rnum;
2222
6
  const int rnum = (to_param_ref < 0) ? to_parameter_indices->rnum : 1;
2223
6
  assert(aux_in_size >= 0);
2224
6
  assert(aux_out_size >= 0);
2225
6
  int i, j;
2226
6
  ccv_nnc_tensor_t* inputs[aux_in_size + 1];
2227
6
  ccv_nnc_tensor_t* outputs[aux_out_size + 1];
2228
10
  for (i = 0; i < aux_in_size; i++)
2229
4
    inputs[i + 1] = aux_ins[i];
2230
14
  for (i = 0; i < aux_out_size; i++)
2231
8
    outputs[i + 1] = aux_outs[i];
2232
12
  for (i = 0; i < rnum; i++)
2233
6
  {
2234
6
    const int dest_d = *(int*)ccv_array_get(to_parameter_indices, to_param_ref >= 0 ? to_param_ref : i);
2235
6
    assert(dest_d >= 0);
2236
6
    assert(dest_d < to_compiled_data->parameters->rnum);
2237
6
    if (parallel_count > 1)
2238
0
    {
2239
0
      ccv_nnc_stream_context_t* streams[parallel_count];
2240
0
      ccv_nnc_stream_signal_t* signal;
2241
0
      if (stream_context)
2242
0
        signal = ccv_nnc_stream_context_emit_signal_new(stream_context);
2243
0
      for (j = 0; j < parallel_count; j++)
2244
0
      {
2245
0
        ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d + j * to_parameter_size];
2246
0
        if (!dest)
2247
0
        {
2248
0
          streams[j] = 0;
2249
0
          continue;
2250
0
        }
2251
0
        const int stream_type = CCV_TENSOR_GET_MEMORY(dest->info.type) == CCV_TENSOR_GPU_MEMORY ? CCV_STREAM_CONTEXT_GPU : CCV_STREAM_CONTEXT_CPU;
2252
0
        const int device_id = CCV_TENSOR_GET_DEVICE_ID(dest->info.type);
2253
0
        int type = stream_type;
2254
0
        CCV_STREAM_SET_DEVICE_ID(type, device_id);
2255
0
        ccv_nnc_stream_context_t* const stream_0 = ccv_cnnp_compiled_data_get_stream(to_compiled_data, type);
2256
        // Wait signal to finish.
2257
0
        if (stream_context)
2258
0
          ccv_nnc_stream_context_wait_signal(stream_0, signal);
2259
0
        inputs[0] = outputs[0] = dest;
2260
0
        ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_0);
2261
0
        if (stream_context)
2262
0
        {
2263
0
          ccv_nnc_stream_signal_t* const signal = ccv_nnc_stream_context_emit_signal_new(stream_0);
2264
0
          ccv_nnc_stream_context_wait_signal(stream_context, signal);
2265
0
        }
2266
0
        streams[j] = stream_0;
2267
0
      }
2268
      // If this should be blocking, block it.
2269
0
      if (!stream_context)
2270
0
        for (j = 0; j < parallel_count; j++)
2271
0
          if (streams[j])
2272
0
            ccv_nnc_stream_context_wait(streams[j]);
2273
6
    } else {
2274
6
      ccv_nnc_tensor_t* const dest = tensor_gradients[dest_d];
2275
6
      assert(dest);
2276
6
      inputs[0] = outputs[0] = dest;
2277
6
      ccv_nnc_cmd_exec(cmd, hint, flags, inputs, aux_in_size + 1, outputs, aux_out_size + 1, stream_context);
2278
6
    }
2279
    // No need to mark this symbol as init'ed, it is already.
2280
6
  }
2281
6
  ccv_array_free(to_parameter_indices);
2282
6
}
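A sketch of the gradient counterpart: zeroing whatever gradients (or accumulated gradients) are currently held, e.g. to drop a partially accumulated batch; this assumes at least one backward pass has run so the gradient tensors exist:
ccv_cnnp_model_parameter_gradients_map(model, ccv_cnnp_model_parameters(model, -1, -1),
  CMD_SET_FORWARD(0), ccv_nnc_no_hint, 0, 0, 0, 0, 0, 0);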
2283
2284
ccv_nnc_cmd_t ccv_cnnp_model_minimizer(ccv_cnnp_model_t* const model)
2285
2.20k
{
2286
2.20k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2287
2.20k
  assert(compiled_data);
2288
2.20k
  return compiled_data->minimize.minimizer;
2289
2.20k
}
2290
2291
void ccv_cnnp_model_set_minimizer(ccv_cnnp_model_t* const model, const ccv_nnc_cmd_t minimizer, const int reset, const ccv_cnnp_model_io_t* const set_parameters, const int set_parameter_size)
2292
4.36k
{
2293
4.36k
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2294
4.36k
  assert(compiled_data);
2295
4.36k
  const int parameter_size = compiled_data->parameters->rnum;
2296
4.36k
  if (parameter_size == 0)
2297
1
    return;
2298
4.36k
  if (reset)
2299
2.49k
    { assert(set_parameters == 0 && set_parameter_size == 0); }
2300
4.36k
  const int old_max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2301
4.36k
  const int saved_aux_size = ccv_nnc_minimizer_saved_aux_size(minimizer);
2302
4.36k
  if (saved_aux_size > compiled_data->minimize.max_saved_aux_size)
2303
7
    compiled_data->minimize.max_saved_aux_size = saved_aux_size;
2304
4.36k
  const int max_saved_aux_size = compiled_data->minimize.max_saved_aux_size;
2305
  // We update all parameters, at this point, we have one minimizer.
2306
4.36k
  if (set_parameters == 0 || set_parameter_size == 0)
2307
4.06k
    compiled_data->minimize.minimizer = minimizer;
2308
4.36k
  int i;
2309
4.36k
  if (set_parameters && set_parameter_size)
2310
301
  {
2311
    // We need to save which minimizer goes along with these parameters.
2312
301
    if (!compiled_data->minimize.parameters)
2313
5
      compiled_data->minimize.parameters = ccv_array_new(sizeof(ccv_cnnp_set_minimizer_for_parameter_t*), 1, 0);
2314
301
    ccv_cnnp_set_minimizer_for_parameter_t* const set_minimizer_for_parameter = ccmalloc(sizeof(ccv_cnnp_set_minimizer_for_parameter_t) + (set_parameter_size - 1) * sizeof(ccv_cnnp_model_io_t));
2315
301
    set_minimizer_for_parameter->minimizer = minimizer;
2316
301
    set_minimizer_for_parameter->parameter_size = set_parameter_size;
2317
301
    memcpy(set_minimizer_for_parameter->parameters, set_parameters, sizeof(ccv_cnnp_model_io_t) * set_parameter_size);
2318
301
    ccv_array_push(compiled_data->minimize.parameters, &set_minimizer_for_parameter);
2319
301
  }
2320
  // If reset is true, clear the parameters array.
2321
4.36k
  if (reset && compiled_data->minimize.parameters)
2322
291
  {
2323
582
    for (i = 0; i < compiled_data->minimize.parameters->rnum; i++)
2324
291
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2325
291
    ccv_array_clear(compiled_data->minimize.parameters);
2326
291
  }
2327
4.36k
  if (!compiled_data->update_nodes)
2328
9
    return;
2329
4.35k
  ccv_nnc_symbolic_graph_t* const symbolic_graph = model->graph;
2330
4.35k
  assert(symbolic_graph);
2331
4.35k
  if (saved_aux_size > old_max_saved_aux_size)
2332
7
  {
2333
7
    assert(compiled_data->updated_parameters);
2334
    // Reallocate first, move them around later.
2335
7
    compiled_data->updated_parameters = (ccv_nnc_tensor_symbol_t*)ccrealloc(compiled_data->updated_parameters, sizeof(ccv_nnc_tensor_symbol_t) * parameter_size + sizeof(ccv_nnc_graph_exec_symbol_t) * parameter_size + sizeof(ccv_nnc_tensor_symbol_map_t) * saved_aux_size * parameter_size);
2336
7
    compiled_data->update_nodes = (ccv_nnc_graph_exec_symbol_t*)(compiled_data->updated_parameters + parameter_size);
2337
7
    compiled_data->saved_aux = (ccv_nnc_tensor_symbol_map_t*)(compiled_data->update_nodes + parameter_size);
2338
    // We need to do this from back to front because saved_aux_size > old_saved_aux_size, so the moved regions could overlap.
2339
7
    _ccv_cnnp_scatter_saved_aux(compiled_data->saved_aux, parameter_size, old_max_saved_aux_size, saved_aux_size);
2340
7
  }
2341
4.35k
  int flag = 0;
2342
4.35k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2343
4.35k
  if (set_parameters && set_parameter_size)
2344
296
  {
2345
296
    ccv_array_t* const parameter_indices = ccv_array_new(sizeof(int), 0, 0);
2346
592
    for (i = 0; i < set_parameter_size; i++)
2347
296
    {
2348
296
      const int param_sel = set_parameters[i]->param_sel > 0 ? set_parameters[i]->param_sel - 1 : set_parameters[i]->param_sel;
2349
296
      assert(set_parameters[i]->param_sel != 0);
2350
296
      const int old_rnum = parameter_indices->rnum;
2351
296
      ccv_cnnp_model_add_to_parameter_indices(set_parameters[i]->model, param_sel, parameter_indices);
2352
296
      const int param_ref = set_parameters[i]->param_ref > 0 ? set_parameters[i]->param_ref - 1 : set_parameters[i]->param_ref;
2353
296
      assert(set_parameters[i]->param_ref != 0);
2354
296
      if (param_ref >= 0)
2355
0
      {
2356
0
        assert(param_ref + old_rnum < parameter_indices->rnum);
2357
0
        *(int*)ccv_array_get(parameter_indices, old_rnum) = *(int*)ccv_array_get(parameter_indices, param_ref + old_rnum);
2358
0
        parameter_indices->rnum = old_rnum + 1;
2359
0
      }
2360
296
    }
2361
    // We may have duplicated indices, but that is OK, we will set it twice.
2362
5.24k
    for (i = 0; i < parameter_indices->rnum; i++)
2363
4.95k
    {
2364
4.95k
      const int d = *(int*)ccv_array_get(parameter_indices, i);
2365
4.95k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, d))
2366
0
        flag = 1;
2367
4.95k
    }
2368
296
    ccv_array_free(parameter_indices);
2369
4.05k
  } else {
2370
19.1k
    for (i = 0; i < parameter_size; i++)
2371
15.1k
      if (_ccv_cnnp_set_minimizer_for_parameter(symbolic_graph, compiled_data, compiled_data->update_nodes, compiled_data->updated_parameters, compiled_data->saved_aux, parallel_count, minimizer, saved_aux_size, max_saved_aux_size, i))
2372
65
        flag = 1;
2373
4.05k
    if (compiled_data->minimize.parameters)
2374
291
      if (_ccv_cnnp_apply_parameters_with_minimizer(model))
2375
0
        flag = 1;
2376
4.05k
  }
2377
4.35k
  if (flag)
2378
7
  {
2379
    // If saved_aux_size doesn't match, we need to remove / add new saved_aux to the graph. But first, free up the apply gradients graph.
2380
7
    if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_FIT_MODE)
2381
0
      _ccv_cnnp_compiled_data_graph_free(compiled_data);
2382
7
    _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
2383
7
  }
2384
4.35k
}
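A hedged sketch of the two paths above: first reset the whole model to one minimizer, then pin a different one on a sub-model's parameters. CMD_SGD_FORWARD and MODEL_IO_LIST come from the public ccv_nnc headers; the exact SGD hyperparameter order shown is an assumption, and `backbone` is a hypothetical sub-model:
// One minimizer for everything (reset = 1, so no per-parameter overrides survive).
ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.01, 1, 0.001, 0.9, 0.9), 1, 0, 0);
// Override just the backbone's parameters with a smaller learning rate (reset = 0).
ccv_cnnp_model_set_minimizer(model, CMD_SGD_FORWARD(0, 0.001, 1, 0.001, 0.9, 0.9), 0,
  MODEL_IO_LIST(ccv_cnnp_model_parameters(backbone, -1, -1)));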
2385
2386
void ccv_cnnp_model_set_compile_params(ccv_cnnp_model_t* const model, const ccv_nnc_symbolic_graph_compile_param_t compile_params)
2387
0
{
2388
0
  ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
2389
0
  assert(compiled_data);
2390
0
  compiled_data->compile_params = compile_params;
2391
0
}
2392
2393
void ccv_cnnp_model_dot(const ccv_cnnp_model_t* const model, const int flags, FILE** const outs, const int out_size)
2394
23
{
2395
23
  if (model->graph && out_size > 0)
2396
23
    ccv_nnc_symbolic_graph_dot(model->graph, flags, outs[0]);
2397
23
  if (model->compiled_data && model->compiled_data->graph && out_size > 1)
2398
0
    ccv_nnc_graph_dot(model->compiled_data->graph, flags, outs[1]);
2399
23
  if (model->compiled_data && model->compiled_data->backward.accum && out_size > 2)
2400
0
    ccv_nnc_graph_dot(model->compiled_data->backward.accum, flags, outs[2]);
2401
23
  if (model->compiled_data && model->compiled_data->apply_gradients.graph && out_size > 3)
2402
0
    ccv_nnc_graph_dot(model->compiled_data->apply_gradients.graph, flags, outs[3]);
2403
23
}
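A small sketch of dumping the graphs for inspection; CCV_NNC_LONG_DOT_GRAPH is the verbose flag from ccv_nnc.h, and with out_size == 2 the second file is only written once the concrete graph exists:
FILE* outs[2] = { fopen("symbolic.dot", "w+"), fopen("graph.dot", "w+") };
ccv_cnnp_model_dot(model, CCV_NNC_LONG_DOT_GRAPH, outs, 2);
fclose(outs[0]);
fclose(outs[1]);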
2404
2405
static void _ccv_cnnp_compiled_data_free(const ccv_cnnp_model_t* const model, ccv_cnnp_compiled_data_t* const compiled_data)
2406
2.26k
{
2407
2.26k
  int i;
2408
2.26k
  const int parameter_size = compiled_data->parameters->rnum;
2409
2.26k
  ccv_array_free(compiled_data->parameters);
2410
2.26k
  const int internal_size = compiled_data->internals->rnum;
2411
2.26k
  ccv_array_free(compiled_data->internals);
2412
2.26k
  assert(compiled_data->ids.parameters->rnum == parameter_size);
2413
2.26k
  assert(compiled_data->ids.internals->rnum == internal_size);
2414
5.20k
  for (i = 0; i < parameter_size; i++)
2415
2.94k
    ccfree(*(char**)ccv_array_get(compiled_data->ids.parameters, i));
2416
2.26k
  ccv_array_free(compiled_data->ids.parameters);
2417
2.42k
  for (i = 0; i < internal_size; i++)
2418
161
    ccfree(*(char**)ccv_array_get(compiled_data->ids.internals, i));
2419
2.26k
  ccv_array_free(compiled_data->ids.internals);
2420
2.26k
  const int parallel_count = ccv_max(model->parallel_count, 1);
2421
2.26k
  if (compiled_data->tensors.parameters)
2422
52
  {
2423
705
    for (i = 0; i < parameter_size * parallel_count; i++)
2424
653
      ccv_nnc_tensor_free(compiled_data->tensors.parameters[i]);
2425
207
    for (i = 0; i < internal_size * parallel_count; i++)
2426
155
      if (compiled_data->tensors.internals[i])
2427
155
        ccv_nnc_tensor_free(compiled_data->tensors.internals[i]);
2428
52
    ccfree(compiled_data->tensors.parameters);
2429
52
  }
2430
2.26k
  if (compiled_data->tensors.gradients)
2431
24
  {
2432
345
    for (i = 0; i < parameter_size * parallel_count; i++)
2433
321
    {
2434
321
      ccv_nnc_tensor_free(compiled_data->tensors.gradients[i]);
2435
321
      if (compiled_data->tensors.accum_gradients[i])
2436
15
        ccv_nnc_tensor_free(compiled_data->tensors.accum_gradients[i]);
2437
321
    }
2438
24
    ccfree(compiled_data->tensors.gradients);
2439
24
  }
2440
2.26k
  if (compiled_data->minimize.parameters)
2441
5
  {
2442
15
    for (i = 0; i < compiled_data->minimize.parameters->rnum; i++)
2443
10
      ccfree(*(ccv_cnnp_set_minimizer_for_parameter_t**)ccv_array_get(compiled_data->minimize.parameters, i));
2444
5
    ccv_array_free(compiled_data->minimize.parameters);
2445
5
  }
2446
2.26k
  if (compiled_data->rewindables)
2447
34
    ccv_array_free(compiled_data->rewindables);
2448
2.26k
  if (compiled_data->tensors_init.v)
2449
52
    ccfree(compiled_data->tensors_init.v);
2450
2.26k
  if (compiled_data->evaluate.tos)
2451
2.26k
    ccfree(compiled_data->evaluate.tos);
2452
2.26k
  compiled_data->evaluate.tos = 0;
2453
2.26k
  if (compiled_data->stream_map)
2454
4
  {
2455
4
    khiter_t k;
2456
36
    for (k = kh_begin(compiled_data->stream_map); k != kh_end(compiled_data->stream_map); ++k)
2457
32
    {
2458
32
      if (!kh_exist(compiled_data->stream_map, k))
2459
16
        continue;
2460
16
      ccv_nnc_stream_context_t* const stream = kh_val(compiled_data->stream_map, k);
2461
16
      ccv_nnc_stream_context_free(stream);
2462
16
    }
2463
4
    kh_destroy(stream_map, compiled_data->stream_map);
2464
4
  }
2465
2.26k
  _ccv_cnnp_compiled_data_graph_free(compiled_data);
2466
2.26k
  _ccv_cnnp_compiled_data_gradient_free(compiled_data);
2467
2.26k
  _ccv_cnnp_compiled_data_backward_free(compiled_data);
2468
2.26k
  _ccv_cnnp_compiled_data_apply_gradients_free(compiled_data);
2469
2.26k
  ccv_nnc_xpu_alloc_destroy(&compiled_data->xpu_alloc);
2470
2.26k
  ccfree(compiled_data);
2471
2.26k
}
2472
2473
void ccv_cnnp_model_free(ccv_cnnp_model_t* const model)
2474
5.35k
{
2475
5.35k
  if (model->isa->deinit)
2476
1.35k
    model->isa->deinit(model);
2477
5.35k
  if (model->io)
2478
745
  {
2479
745
    int i;
2480
1.82k
    for (i = 0; i < model->io->rnum; i++)
2481
1.08k
    {
2482
1.08k
      ccv_cnnp_model_io_t model_io = *(ccv_cnnp_model_io_t*)ccv_array_get(model->io, i);
2483
1.08k
      if (model_io->outgoings)
2484
633
        ccv_array_free(model_io->outgoings);
2485
1.08k
      if (model_io->incomings)
2486
579
        ccv_array_free(model_io->incomings);
2487
1.08k
      ccfree(model_io);
2488
1.08k
    }
2489
745
    ccv_array_free(model->io);
2490
745
  }
2491
5.35k
  if (model->parameter_indices)
2492
2.51k
    ccv_array_free(model->parameter_indices);
2493
5.35k
  if (model->inputs)
2494
2.26k
    ccfree(model->inputs);
2495
5.35k
  if (model->graph)
2496
2.26k
    ccv_nnc_symbolic_graph_free(model->graph);
2497
5.35k
  if (model->compiled_data)
2498
2.26k
    _ccv_cnnp_compiled_data_free(model, model->compiled_data);
2499
5.35k
  if (model->name)
2500
109
    ccfree(model->name);
2501
5.35k
  ccfree(model);
2502
5.35k
}